├── .reuse
│   └── dep5
├── Crosslanguage_model.pkl
├── JS_monolanguage_model.pkl
├── LICENSE
├── LICENSES
│   └── Apache-2.0.txt
├── Labelled_Dataset.csv
├── Malicious_Packages_Discovered.csv
├── Py_monolanguage_model.pkl
├── README.md
└── scripts
    ├── Crosslanguage_DT_train_test.py
    ├── Crosslanguage_RF_train_test.py
    ├── Crosslanguage_XGBoost_train_test.py
    ├── JS_monolanguage_DT_train_test.py
    ├── JS_monolanguage_RF_train_test.py
    ├── JS_monolanguage_XGBoost_train_test.py
    ├── Py_monolanguage_DT_train_test.py
    ├── Py_monolanguage_RF_train_test.py
    ├── Py_monolanguage_XGBoost_train_test.py
    ├── feature_extraction
    │   ├── README.md
    │   ├── npm-test.py
    │   ├── npm_feature_extractor.py
    │   ├── pypi-test.py
    │   ├── pypi_feature_extractor.py
    │   ├── requirements.txt
    │   ├── resources
    │   │   └── dangerous_tokens.json
    │   └── utilities_functions.py
    ├── requirements.txt
    └── utilities_functions.py
/.reuse/dep5: -------------------------------------------------------------------------------- 1 | Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ 2 | Upstream-Name: cross-language-detection-artifacts 3 | Upstream-Contact: ospo@sap.com 4 | Source: 5 | Disclaimer: The code in this project may include calls to APIs ("API Calls") of 6 | SAP or third-party products or services developed outside of this project 7 | ("External Products"). 8 | "APIs" means application programming interfaces, as well as their respective 9 | specifications and implementing code that allows software to communicate with 10 | other software. 11 | API Calls to External Products are not licensed under the open source license 12 | that governs this project. The use of such API Calls and related External 13 | Products are subject to applicable additional agreements with the relevant 14 | provider of the External Products. In no event shall the open source license 15 | that governs this project grant any rights in or to any External Products, or 16 | alter, expand or supersede any terms of the applicable additional agreements. 17 | If you have a valid license agreement with SAP for the use of a particular SAP 18 | External Product, then you may make use of any API Calls included in this 19 | project's code for that SAP External Product, subject to the terms of such 20 | license agreement. If you do not have a valid license agreement for the use of 21 | a particular SAP External Product, then you may only make use of any API Calls 22 | in this project for that SAP External Product for your internal, non-productive 23 | and non-commercial test and evaluation of such API Calls. Nothing herein grants 24 | you any rights to use or access any SAP External Product, or provide any third 25 | parties the right to use or access any SAP External Product, through API Calls.
26 | 27 | Files: Crosslanguage_model.pkl JS_monolanguage_model.pkl Py_monolanguage_model.pkl Labelled_Dataset.csv Malicious_Packages_Discovered.csv README.md scripts/* 28 | Copyright: 2023 SAP SE or an SAP affiliate company and cross-language-detection-artifacts contributors 29 | License: Apache-2.0 -------------------------------------------------------------------------------- /Crosslanguage_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAP-samples/cross-language-detection-artifacts/ea937f5fbd8952cceb8a9b1bbc6f855fe7340f31/Crosslanguage_model.pkl -------------------------------------------------------------------------------- /JS_monolanguage_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAP-samples/cross-language-detection-artifacts/ea937f5fbd8952cceb8a9b1bbc6f855fe7340f31/JS_monolanguage_model.pkl -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /LICENSES/Apache-2.0.txt: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Malicious_Packages_Discovered.csv: -------------------------------------------------------------------------------- 1 | package_repository,package_name,description,Obfuscation type 2 | npm,aa-testkit-9.9.3,Data exfiltration,None 3 | npm,ast-viewer-8.2.6,Data exfiltration,None 4 | npm,babel-preset-geocaching-1.0.0,Data exfiltration,None 5 | npm,bank-aa-9.9.5,Data exfiltration,None 6 | npm,buffer-polyfill-9.9.3,Data exfiltration,None 7 | npm,bundled-deps-poc-1.0.0,Rickrolling Attack,None 8 | npm,cms-ui-components-721.78.2,Research Poc,AES-256 9 | npm,cnscode-1.0.0,Research Poc,None 10 | npm,cnscodes-6.0.0,Research Poc,None 11 | npm,codemirror-6-getting-started-8.2.6,Data exfiltration,None 12 | npm,colorajs-1.1.5,Dropper,None 13 | npm,geocaching-express-account-middleware-1.0.0,Data exfiltration,None 14 | npm,geocomponents-1.0.0,Data exfiltration,None 15 | npm,hardhat-config-1.0.0,Data exfiltration,Custom 16 | npm,hardhat-modern-1.0.1,Data exfiltration,Custom 17 | npm,new-contracts-9.9.3,Data Exfiltration,None 18 | npm,nodexda-1.0.0,Dropper,None 19 | npm,pluginji-api1337-9.9.3,Data exfiltration,None 20 | npm,rdocumentation-app-8.2.6,Data exfiltration,None 21 | npm,rdocumentation-workers-8.2.6,Data exfiltration,None 22 | npm,react-dropzone-3-900.9.9,Research Poc,AES-256 23 | npm,reviewstack-0.1.0,Data exfiltration,None 24 | npm,richsite-1.0.0,Data exfiltration,None 25 | npm,rickroll-on-install-1.0.1,Rickrolling attack,None 26 | npm,shein-bbl-1.0.0,Data exfiltration,None 27 | npm,shein-components-1.0.0,Data exfiltration,None 28 | npm,ta123123-1.0.0,Data exfiltration,None 29 | npm,vital-neat-engine-0.0.4,Reverse shell,None 30 | npm,waffles-next-doc-site-8.2.6,Data exfiltration,None 31 | npm,waffles2-presentation-8.2.6,Data exfiltration,None 32 | npm,we-lib-login-tgp-69.69.69,Data exfiltration,None 33 | npm,zvulnerabilityscanner-1.0.0,Dropper,None 34 | npm,duc.components.cardshell-10.0.5,Data exfiltration,None 35 | npm,duc.fragments.spinner-10.0.1,Data exfiltration,None 36 | npm,duc.utils.conditional-wrapper-10.0.1,Data exfiltration,None 37 | npm,duc.utils.conditional-wrapper-10.0.2,Data exfiltration,None 38 | npm,excessively-safe-call-9.9.3,Data exfiltration,None 39 | npm,file-dep-poc-2-1.0.0,Rickrolling attack,None 40 | pypi,phaseOne-1.0.7,Keylogger,None 41 | pypi,phaseOne-2.2.7,Keylogger,None 42 | pypi,ci_common_utils-0.4.0,Data exfiltration,None 43 | pypi,pkg-with-extras-0.4.0,Data exfiltration,None 44 | pypi,python-kudu-0.4.0,Data exfiltration,None 45 | pypi,sonic-py-common-0.4.0,"Data exfiltration ",None 46 | pypi,cnscodes-1.2.4,Dropper,None 47 | pypi,debricked-test-0.5,Reverse Shell,None 48 | pypi,Deepmountains_rce-0.0.1,Reverse Shell,None 49 | pypi,Deepmountains_wrce-0.0.1,Reverse Shell,None 50 | pypi,dlcsord-1.0.3,Dropper,Custom 51 | pypi,pycbytes-1.0.0,Dropper,Custom 52 | pypi,drawtime-0.9,Rickrolling Attack,None 53 | pypi,MuktesitTABAN-0.0.16,Data exfiltration,PyArmor 54 | pypi,tshawn_lrce-0.0.1,Reverse Shell,None 55 | pypi,tshawn_wrce-0.0.2,Reverse Shell,None 56 | 
pypi,rumihelling-0.0.1,Dropper,None 57 | pypi,rumihell-0.0.1,Dropper,None 58 | pypi,teleport-client-100.1.0,Data exfiltration,None -------------------------------------------------------------------------------- /Py_monolanguage_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SAP-samples/cross-language-detection-artifacts/ea937f5fbd8952cceb8a9b1bbc6f855fe7340f31/Py_monolanguage_model.pkl -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # On the Feasibility of Cross-Language Detection of Malicious Packages in npm and PyPI - Paper Artifacts 2 | 3 | 4 | 7 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) [![REUSE status](https://api.reuse.software/badge/github.com/SAP-samples/cross-language-detection-artifacts)](https://api.reuse.software/info/github.com/SAP-samples/cross-language-detection-artifacts) 8 | 9 | ## Description 10 | 11 | 12 | This supplementary material constitutes the artifact for our paper titled "On the Feasibility of Cross-Language Detection of Malicious Packages in npm and PyPI". 13 | Specifically: 14 | - `Labelled_Dataset.csv` (1.8MB in size): 15 | - This file contains the labelled dataset used in our evaluation process. It houses the data employed for evaluating the features, assessing the models, and training the classifiers. This dataset can be used to gain further insights into how features were selected, evaluate the models' performance, and comprehend the training process. 16 | - `.pkl` Files (cross-language (125.9 kB), JavaScript mono-language (73.7 kB), and Python mono-language (110.1 kB)): 17 | - These files, saved in pickle format, encapsulate the best-performing classifiers that emerged from our models' evaluation. They can be imported into Python using the scikit-learn library. This enables you to use these classifiers for classification tasks. 18 | - `Malicious_Packages_Discovered.csv`: 19 | - This file comprises a comprehensive list of malicious packages that were successfully detected during our real-world experiment. It details their characteristics, behavior types, and obfuscation techniques. 20 | - `scripts/*`: 21 | - This folder contains various scripts, including those to reproduce the evaluation and training of the models presented in the paper. 22 | 23 | ## Requirements and Run 24 | 25 | (1) Install the dependencies via: 26 | 27 | ``` 28 | $ pip install -r ./scripts/requirements.txt 29 | ``` 30 | 31 | (2) Run the script according to the model you want to reproduce (i.e., cross-language or mono-language). For example, in the case of the cross-language XGBoost model, you can run from the main folder: 32 | 33 | ``` 34 | $ python ./scripts/Crosslanguage_XGBoost_train_test.py 35 | ``` 36 | 37 | The produced classifier can then be imported in Python using the joblib function `load()`, for example: 38 | 39 | ``` 40 | classifier_XGBoost = joblib.load(model_path) 41 | ``` 42 | *************************************************************************** 43 | ## Features Explanation 44 | 45 | `Labelled_Dataset.csv` is a labelled dataset containing packages from two distinct public repositories: NPM and PyPI. It addresses the issue of classifying malicious packages, taking into account the imbalance problem.
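As a concrete illustration of how these artifacts fit together, here is a minimal sketch. It is not part of the original scripts, and it assumes that `Labelled_Dataset.csv` and `Crosslanguage_model.pkl` from this repository are in the working directory and that compatible versions of pandas, scikit-learn, xgboost, and joblib are installed:

```
# Hypothetical usage sketch (not part of the original artifact).
import joblib
import pandas as pd

# Load the labelled dataset shipped with this repository.
data = pd.read_csv('./Labelled_Dataset.csv', sep=',')

# Class balance: 'Malicious' is 1 for malicious packages and 0 for benign ones.
print(data['Malicious'].value_counts(normalize=True))
print(data.groupby('Package Repository')['Malicious'].mean())

# Build the feature matrix the same way the training scripts under scripts/ do,
# so that the column order matches what the pickled classifiers expect.
X = data.drop(labels=['Package Repository', 'Malicious', 'Package Name'], axis=1).values

# Load the pre-trained cross-language classifier and classify the packages.
classifier = joblib.load('./Crosslanguage_model.pkl')
print(classifier.predict(X)[:10])
```

Dropping the `Package Repository`, `Malicious`, and `Package Name` columns mirrors the preprocessing in the `scripts/*_train_test.py` files, which keeps the feature order consistent with the pickled models.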
46 | 47 | In this scenario, we assume that the estimated percentage of malicious packages among all packages is approximately 10%. 48 | 49 | The columns of the CSV file are explained below. 50 | 51 | #### Package information 52 | 53 | - `Malicious`: encodes the target value, i.e., whether a package is benign (value of 0) or malicious (value of 1) 54 | - `Package Repository`: specifies which package repository the package comes from (i.e., NPM or PyPI) 55 | - `Package Name`: specifies the package name 56 | 57 | #### Features extracted from the source code files (.js, .py) 58 | - `Number of Words in source code` : count of words in the source code files 59 | - `Number of lines in source code` : count of lines in the source code files 60 | - `Number of sospicious token in source code` : count of suspicious words (e.g., bash commands, paths to sensitive files) in source code files 61 | - `Number of URLs in source code` 62 | - `Number of base64 chunks in source code` : number of detected valid base64 strings in source code 63 | - `Number of IP adress in source code` 64 | - `bracker ratio mean` : Mean of the ratio (no. of square brackets/byte size) among source code files 65 | - `bracker ratio std` : Standard Deviation of the ratio (no. of square brackets/byte size) among source code files 66 | - `bracker ratio max` : Maximum of the ratio (no. of square brackets/byte size) among source code files 67 | - `bracker ratio q3` : Third quartile of the ratio (no. of square brackets/byte size) among source code files 68 | - `eq ratio mean` : Mean of the ratio (equal signs/byte size) among source code files 69 | - `eq ratio std` : Standard Deviation of the ratio (equal signs/byte size) among source code files 70 | - `eq ratio max` : Maximum of the ratio (equal signs/byte size) among source code files 71 | - `eq ratio q3` : Third quartile of the ratio (equal signs/byte size) among source code files 72 | - `plus ratio mean` : Mean of the ratio (plus signs/byte size) among source code files 73 | - `plus ratio std` : Standard Deviation of the ratio (plus signs/byte size) among source code files 74 | - `plus ratio max` : Maximum of the ratio (plus signs/byte size) among source code files 75 | - `plus ratio q3` : Third quartile of the ratio (plus signs/byte size) among source code files 76 | - `shannon mean ID source code` : Shannon Entropy's mean computed on identifiers, after applying the generalization language 77 | - `shannon std ID source code` : Shannon Entropy's standard deviation computed on identifiers, after applying the generalization language 78 | - `shannon max ID source code` : Shannon Entropy's maximum value computed on identifiers, after applying the generalization language 79 | - `shannon q3 ID source code` : Shannon Entropy's third quartile computed on identifiers, after applying the generalization language 80 | - `shannon mean string source code` : Shannon Entropy's mean computed on strings, after applying the generalization language 81 | - `shannon std string source code` : Shannon Entropy's standard deviation computed on strings, after applying the generalization language 82 | - `shannon max string source code` : Shannon Entropy's maximum value computed on strings, after applying the generalization language 83 | - `shannon q3 string source code` : Shannon Entropy's third quartile of tokenized strings, after applying the generalization language 84 | - `homogeneous identifiers in source code` : Number of homogeneous identifiers, i.e., identifiers having all characters equal after
transforming them through the generalization language 85 | - `homogeneous strings in source code` : Number of homogeneous strings, i.e., strings having all characters equal after transforming them through the generalization language 86 | - `heterogeneous identifiers in source code` : Number of heterogeneous identifiers, i.e., identifiers with more than one symbol after transforming them using the generalization language 87 | - `heterogeneous strings in source code` : Number of heterogeneous strings, i.e., strings with more than one symbol after transforming them using the generalization language 88 | #### Metadata file (NPM: package.json, PyPI: setup.py) 89 | The following features are extracted from the 'metadata' file of the packages, i.e., the `package.json` file for NPM and the `setup.py` file for PyPI: 90 | - `Number of Words in metadata` : count of words in the metadata files (i.e., `package.json` file for NPM and the `setup.py` file for PyPI) 91 | - `Number of lines in metadata` : count of lines in the metadata files (i.e., `package.json` file for NPM and the `setup.py` file for PyPI) 92 | - `Number of sospicious token in metadata` : count of suspicious words (e.g., bash commands, paths to sensitive files) in the metadata files (i.e., `package.json` file for NPM and the `setup.py` file for PyPI) 93 | - `Number of URLs in metadata` 94 | - `Number of base64 chunks in metadata` : number of detected valid base64 strings in the metadata files (i.e., `package.json` file for NPM and the `setup.py` file for PyPI) 95 | - `Number of IP adress in metadata` 96 | - `presence of installation script` : boolean for the presence of an installation script (PyPI: install script - NPM: presence of the keys `postinstall`, `preinstall`, `install`) 97 | - `shannon mean ID metadata` : Shannon Entropy's mean computed on identifiers, after applying the generalization language 98 | - `shannon std ID metadata` : Shannon Entropy's standard deviation computed on identifiers, after applying the generalization language 99 | - `shannon max ID metadata` : Shannon Entropy's maximum value computed on identifiers, after applying the generalization language 100 | - `shannon q3 ID metadata` : Shannon Entropy's third quartile computed on identifiers, after applying the generalization language 101 | - `shannon mean metadata` : Shannon Entropy's mean computed on strings, after applying the generalization language 102 | - `shannon std string metadata` : Shannon Entropy's standard deviation computed on strings, after applying the generalization language 103 | - `shannon max string metadata` : Shannon Entropy's maximum value computed on strings, after applying the generalization language 104 | - `shannon q3 string metadata` : Shannon Entropy's third quartile of tokenized strings, after applying the generalization language 105 | - `homogeneous identifiers in metadata` : Number of homogeneous identifiers, i.e., identifiers having all characters equal after transforming them through the generalization language 106 | - `homogeneous strings in metadata` : Number of homogeneous strings, i.e.
strings having all characters equal after transforming them through the generalization language 107 | - `heterogeneous identifiers in metadata` : Number of heterogeneous identifiers, i.e., identifiers with more than one symbol after transforming them using the generalization language 108 | - `heterogeneous strings in metadata` : Number of heterogeneous strings, i.e., strings with more than one symbol after transforming them using the generalization language 109 | #### Structural features of the package 110 | The following features count the number of files for each of the selected extensions: 111 | 112 | ```'bat', 'bz2', 'c', 'cert', 'conf', 'cpp', 'crt', 'css', 'csv', 'deb', 'erb', 'gemspec', 'gif', 'gz', 'h', 'html', 'ico', 'ini', 'jar', 'java', 'jpg', 'js', 'json', 'key', 'm4v', 'markdown', 'md', 'pdf', 'pem', 'png', 'ps', 'py', 'rb', 'rpm', 'rst', 'sh', 'svg', 'toml', 'ttf', 'txt', 'xml', 'yaml', 'yml', 'eot', 'exe', 'jpeg', 'properties', 'sql', 'swf', 'tar', 'woff', 'woff2', 'aac', 'bmp', 'cfg', 'dcm', 'dll', 'doc', 'flac', 'flv', 'ipynb', 'm4a', 'mid', 'mkv', 'mp3', 'mp4', 'mpg', 'ogg', 'otf', 'pickle', 'pkl', 'psd', 'pxd', 'pxi', 'pyc', 'pyx', 'r', 'rtf', 'so', 'sqlite', 'tif', 'tp', 'wav', 'webp', 'whl', 'xcf', 'xz', 'zip', 'mov', 'wasm', 'webm'.``` 113 | 114 | ************************************************** 115 | 116 | ## Time and Space Cost 117 | 118 | The cost values to train our models were computed on the following configuration: 119 | 120 | `macOS 13.5.1; CPU: 2 GHz Quad-Core Intel Core i5; RAM: 16 GB 3733 MHz LPDDR4X` 121 | 122 | These costs cover the whole process of evaluating the best hyperparameters and training the models. 123 | 124 | For XGBoost we have the following estimates: 125 | - Cross-language model: 126 | - Estimated train time: `271.04s` 127 | - Estimated space cost: `64.08 MB` 128 | - Mono-language model (JavaScript): 129 | - Estimated train time: `198.24s` 130 | - Estimated space cost: `57.06 MB` 131 | - Mono-language model (Python): 132 | - Estimated train time: `195.50s` 133 | - Estimated space cost: `60.98 MB` 134 | 135 | For Decision Tree (DT) we have the following estimates: 136 | - Cross-language model: 137 | - Estimated train time: `86.76s` 138 | - Estimated space cost: `118.24 MB` 139 | - Mono-language model (JavaScript): 140 | - Estimated train time: `51.01s` 141 | - Estimated space cost: `108.03 MB` 142 | - Mono-language model (Python): 143 | - Estimated train time: `46.86s` 144 | - Estimated space cost: `125.35 MB` 145 | 146 | For Random Forest (RF) we have the following estimates: 147 | - Cross-language model: 148 | - Estimated train time: `696.67s` 149 | - Estimated space cost: `193.31 MB` 150 | - Mono-language model (JavaScript): 151 | - Estimated train time: `544.89s` 152 | - Estimated space cost: `125.66 MB` 153 | - Mono-language model (Python): 154 | - Estimated train time: `550.65s` 155 | - Estimated space cost: `122.81 MB` 156 | 157 | 158 | 159 | ## How to obtain support 160 | [Create an issue](https://github.com/SAP-samples/cross-language-detection-artifacts/issues) in this repository if you find a bug or have questions about the content. 161 | 162 | For additional support, [ask a question in SAP Community](https://answers.sap.com/questions/ask.html). 163 | 164 | ## Contributing 165 | If you wish to contribute code or offer fixes and improvements, please send a pull request. Due to legal reasons, contributors will be asked to accept a DCO when they create the first pull request to this project.
This happens in an automated fashion during the submission process. SAP uses [the standard DCO text of the Linux Foundation](https://developercertificate.org/). 166 | 167 | ## License 168 | Copyright (c) 2023 SAP SE or an SAP affiliate company. All rights reserved. This project is licensed under the Apache Software License, version 2.0 except as noted otherwise in the [LICENSE](LICENSE) file. 169 | -------------------------------------------------------------------------------- /scripts/Crosslanguage_DT_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import time 6 | import psutil 7 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 8 | # Train and evaluate performances 9 | 10 | 11 | start_time = time.time() 12 | process = psutil.Process() 13 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 14 | performance, hyperparams=evaluation_decision_tree(data) 15 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 16 | memory_usage = final_memory - initial_memory 17 | print(f"Memory usage: {memory_usage} MB") 18 | print("--- %s seconds ---" % (time.time() - start_time)) 19 | 20 | print(hyperparams) 21 | 22 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 23 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 24 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 25 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 26 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 27 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 28 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 29 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 30 | print("Test Precision NPM : ",round(performance.iloc[-8,0],2),"±",round(performance.iloc[-8,1],2),'%') 31 | print("Test Recall NPM : ",round(performance.iloc[-7,0],2),"±",round(performance.iloc[-7,1],2),'%') 32 | print("Test F1-Score NPM : ",round(performance.iloc[-6,0],2),"±",round(performance.iloc[-6,1],2),'%') 33 | print("Test Accuracy NPM : ",round(performance.iloc[-5,0],2),"±",round(performance.iloc[-5,1],2),'%') 34 | print("Test Precision Pypi : ",round(performance.iloc[-4,0],2),"±",round(performance.iloc[-4,1],2),'%') 35 | print("Test Recall Pypi : ",round(performance.iloc[-3,0],2),"±",round(performance.iloc[-3,1],2),'%') 36 | print("Test F1-Score Pypi : ",round(performance.iloc[-2,0],2),"±",round(performance.iloc[-2,1],2),'%') 37 | print("Test Accuracy Pypi : ",round(performance.iloc[-1,0],2),"±",round(performance.iloc[-1,1],2),'%') 38 | 39 | # Now train final model and dump the model 40 | 41 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 42 | Y = data['Malicious'].astype('int').values 43 | 44 | classifier_DT 
=DecisionTreeClassifier(random_state=123,criterion=hyperparams['criterion'],max_depth=hyperparams['max_depth'],max_features=hyperparams['max_features'],min_samples_leaf=hyperparams['min_sample_leaf'],min_samples_split=hyperparams['min_sample_split']) 45 | classifier_DT.fit(X=X, y=Y) 46 | 47 | 48 | 49 | # Get current date and time 50 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 51 | # save the file in the current work directory 52 | joblib_file='./CrossLanguage_DT_'+dt+'.pkl' 53 | joblib.dump(classifier_DT,joblib_file) -------------------------------------------------------------------------------- /scripts/Crosslanguage_RF_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import time 6 | import psutil 7 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 8 | # Train and evaluate performances 9 | 10 | start_time = time.time() 11 | process = psutil.Process() 12 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 13 | performance, hyperparams=evaluation_random_forest(data) 14 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 15 | memory_usage = final_memory - initial_memory 16 | print(f"Memory usage: {memory_usage} MB") 17 | print("--- %s seconds ---" % (time.time() - start_time)) 18 | print(hyperparams) 19 | 20 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 21 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 22 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 23 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 24 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 25 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 26 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 27 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 28 | print("Test Precision NPM : ",round(performance.iloc[-8,0],2),"±",round(performance.iloc[-8,1],2),'%') 29 | print("Test Recall NPM : ",round(performance.iloc[-7,0],2),"±",round(performance.iloc[-7,1],2),'%') 30 | print("Test F1-Score NPM : ",round(performance.iloc[-6,0],2),"±",round(performance.iloc[-6,1],2),'%') 31 | print("Test Accuracy NPM : ",round(performance.iloc[-5,0],2),"±",round(performance.iloc[-5,1],2),'%') 32 | print("Test Precision Pypi : ",round(performance.iloc[-4,0],2),"±",round(performance.iloc[-4,1],2),'%') 33 | print("Test Recall Pypi : ",round(performance.iloc[-3,0],2),"±",round(performance.iloc[-3,1],2),'%') 34 | print("Test F1-Score Pypi : ",round(performance.iloc[-2,0],2),"±",round(performance.iloc[-2,1],2),'%') 35 | print("Test Accuracy Pypi : ",round(performance.iloc[-1,0],2),"±",round(performance.iloc[-1,1],2),'%') 36 | 37 | # Now train final model and dump the model 38 | 39 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 40 | Y = data['Malicious'].astype('int').values 41 | 42 | classifier_RF 
=RandomForestClassifier(random_state=123,criterion=hyperparams['criterion'],n_estimators=hyperparams['n_estimators'],max_depth=hyperparams['max_depth'],max_features=hyperparams['max_features'],min_samples_leaf=hyperparams['min_sample_leaf'],min_samples_split=hyperparams['min_sample_split'],max_samples=hyperparams['max_samples']) 43 | classifier_RF.fit(X=X, y=Y) 44 | 45 | 46 | 47 | # Get current date and time 48 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 49 | # save the file in the current work directory 50 | joblib_file='./CrossLanguage_RF_'+dt+'.pkl' 51 | joblib.dump(classifier_RF,joblib_file) -------------------------------------------------------------------------------- /scripts/Crosslanguage_XGBoost_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import psutil 6 | 7 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 8 | # Train and evaluate performances 9 | 10 | import time 11 | start_time = time.time() 12 | process = psutil.Process() 13 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 14 | performance, hyperparams=evaluation_NPM_Pypi_xgb(data) 15 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 16 | memory_usage = final_memory - initial_memory 17 | print(f"Memory usage: {memory_usage} MB") 18 | print("--- %s seconds ---" % (time.time() - start_time)) 19 | 20 | 21 | print(hyperparams) 22 | 23 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 24 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 25 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 26 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 27 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 28 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 29 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 30 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 31 | print("Test Precision NPM : ",round(performance.iloc[-8,0],2),"±",round(performance.iloc[-8,1],2),'%') 32 | print("Test Recall NPM : ",round(performance.iloc[-7,0],2),"±",round(performance.iloc[-7,1],2),'%') 33 | print("Test F1-Score NPM : ",round(performance.iloc[-6,0],2),"±",round(performance.iloc[-6,1],2),'%') 34 | print("Test Accuracy NPM : ",round(performance.iloc[-5,0],2),"±",round(performance.iloc[-5,1],2),'%') 35 | print("Test Precision Pypi : ",round(performance.iloc[-4,0],2),"±",round(performance.iloc[-4,1],2),'%') 36 | print("Test Recall Pypi : ",round(performance.iloc[-3,0],2),"±",round(performance.iloc[-3,1],2),'%') 37 | print("Test F1-Score Pypi : ",round(performance.iloc[-2,0],2),"±",round(performance.iloc[-2,1],2),'%') 38 | print("Test Accuracy Pypi : ",round(performance.iloc[-1,0],2),"±",round(performance.iloc[-1,1],2),'%') 39 | 40 | # Now train final model and dump the model 41 | 42 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 43 | Y =
data['Malicious'].astype('int').values 44 | 45 | classifier_XGBoost =xgb.XGBClassifier(random_state=123,n_estimators=hyperparams['n_estimators'],max_depth=hyperparams['max_depth'],gamma=hyperparams['gamma'],eta=hyperparams['eta'],colsample_bytree=hyperparams['colsample_bytree'],min_child_weight=hyperparams['min_child_weight']) 46 | classifier_XGBoost.fit(X=X, y=Y) 47 | 48 | 49 | 50 | # Get current date and time 51 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 52 | # save the file in the current work directory 53 | joblib_file='./CrossLanguage_XGB_'+dt+'.pkl' 54 | joblib.dump(classifier_XGBoost,joblib_file) -------------------------------------------------------------------------------- /scripts/JS_monolanguage_DT_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import time, psutil 6 | 7 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 8 | 9 | data = data[data['Package Repository']=='NPM'] 10 | 11 | 12 | 13 | # Train and evaluate performances 14 | 15 | start_time = time.time() 16 | process = psutil.Process() 17 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 18 | performance, hyperparams=evaluation_decision_tree(data) 19 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 20 | memory_usage = final_memory - initial_memory 21 | print(f"Memory usage: {memory_usage} MB") 22 | print("--- %s seconds ---" % (time.time() - start_time)) 23 | 24 | print(hyperparams) 25 | 26 | 27 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 28 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 29 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 30 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 31 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 32 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 33 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 34 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 35 | 36 | 37 | # Now train final model and dump the model 38 | 39 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 40 | Y = data['Malicious'].astype('int').values 41 | 42 | classifier_DT =DecisionTreeClassifier(random_state=123,criterion=hyperparams['criterion'],max_depth=hyperparams['max_depth'],max_features=hyperparams['max_features'],min_samples_leaf=hyperparams['min_sample_leaf'],min_samples_split=hyperparams['min_sample_split']) 43 | classifier_DT.fit(X=X, y=Y) 44 | 45 | 46 | 47 | 48 | # Get current date and time 49 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 50 | # save the file in the current work directory 51 | joblib_file='./JS_monolanguage_DT_'+dt+'.pkl' 52 | joblib.dump(classifier_DT,joblib_file) -------------------------------------------------------------------------------- /scripts/JS_monolanguage_RF_train_test.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import time, psutil 6 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 7 | 8 | data = data[data['Package Repository']=='NPM'] 9 | 10 | 11 | 12 | # Train and evaluate performances 13 | start_time = time.time() 14 | process = psutil.Process() 15 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 16 | performance, hyperparams=evaluation_random_forest(data) 17 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 18 | memory_usage = final_memory - initial_memory 19 | print(f"Memory usage: {memory_usage} MB") 20 | print("--- %s seconds ---" % (time.time() - start_time)) 21 | print(hyperparams) 22 | 23 | print(hyperparams) 24 | 25 | 26 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 27 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 28 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 29 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 30 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 31 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 32 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 33 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 34 | 35 | 36 | # Now train final model and dump the model 37 | 38 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 39 | Y = data['Malicious'].astype('int').values 40 | 41 | classifier_RF =RandomForestClassifier(random_state=123,criterion=hyperparams['criterion'],n_estimators=hyperparams['n_estimators'],max_depth=hyperparams['max_depth'],max_features=hyperparams['max_features'],min_samples_leaf=hyperparams['min_sample_leaf'],min_samples_split=hyperparams['min_sample_split'],max_samples=hyperparams['max_samples']) 42 | classifier_RF.fit(X=X, y=Y) 43 | 44 | 45 | 46 | 47 | # Get current date and time 48 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 49 | # save the file in the current work directory 50 | joblib_file='./JS_monolanguage_RF_'+dt+'.pkl' 51 | joblib.dump(classifier_RF,joblib_file) -------------------------------------------------------------------------------- /scripts/JS_monolanguage_XGBoost_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import psutil 6 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 7 | 8 | data = data[data['Package Repository']=='NPM'] 9 | 10 | 11 | 12 | # Train and evaluate performances 13 | import time 14 | start_time = time.time() 15 | process = psutil.Process() 16 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 17 | performance, hyperparams=evaluation_NPM_Pypi_xgb(data) 18 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 19 | memory_usage = final_memory - 
initial_memory 20 | print(f"Memory usage: {memory_usage} MB") 21 | print("--- %s seconds ---" % (time.time() - start_time)) 22 | 23 | print(hyperparams) 24 | 25 | 26 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 27 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 28 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 29 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 30 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 31 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 32 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 33 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 34 | 35 | 36 | # Now train final model and dump the model 37 | 38 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 39 | Y = data['Malicious'].astype('int').values 40 | 41 | classifier_XGBoost =xgb.XGBClassifier(random_state=123,n_estimators=hyperparams['n_estimators'],max_depth=hyperparams['max_depth'],gamma=hyperparams['gamma'],eta=hyperparams['eta'],colsample_bytree=hyperparams['colsample_bytree'],min_child_weight=hyperparams['min_child_weight']) 42 | classifier_XGBoost.fit(X=X, y=Y) 43 | 44 | 45 | 46 | 47 | # Get current date and time 48 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 49 | # save the file in the current work directory 50 | joblib_file='./JS_monolanguage_XGB_'+dt+'.pkl' 51 | joblib.dump(classifier_XGBoost,joblib_file) -------------------------------------------------------------------------------- /scripts/Py_monolanguage_DT_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import time, psutil 6 | 7 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 8 | 9 | data = data[data['Package Repository']=='PyPI'] 10 | 11 | 12 | 13 | # Train and evaluate performances 14 | start_time = time.time() 15 | process = psutil.Process() 16 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 17 | performance, hyperparams=evaluation_decision_tree(data) 18 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 19 | memory_usage = final_memory - initial_memory 20 | print(f"Memory usage: {memory_usage} MB") 21 | print("--- %s seconds ---" % (time.time() - start_time)) 22 | 23 | print(hyperparams) 24 | 25 | 26 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 27 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 28 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 29 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 30 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 31 | print("Test False Negative (malicious packages 
classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 32 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 33 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 34 | 35 | 36 | # Now train final model and dump the model 37 | 38 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 39 | Y = data['Malicious'].astype('int').values 40 | 41 | 42 | classifier_DT =DecisionTreeClassifier(random_state=123,criterion=hyperparams['criterion'],max_depth=hyperparams['max_depth'],max_features=hyperparams['max_features'],min_samples_leaf=hyperparams['min_sample_leaf'],min_samples_split=hyperparams['min_sample_split']) 43 | classifier_DT.fit(X=X, y=Y) 44 | 45 | 46 | 47 | # Get current date and time 48 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 49 | # save the file in the current work directory 50 | joblib_file='./Py_monolanguage_DT_'+dt+'.pkl' 51 | joblib.dump(classifier_DT,joblib_file) -------------------------------------------------------------------------------- /scripts/Py_monolanguage_RF_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import time, psutil 6 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 7 | 8 | data = data[data['Package Repository']=='PyPI'] 9 | 10 | 11 | 12 | # Train and evaluate performances 13 | start_time = time.time() 14 | process = psutil.Process() 15 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 16 | performance, hyperparams=evaluation_random_forest(data) 17 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 18 | memory_usage = final_memory - initial_memory 19 | print(f"Memory usage: {memory_usage} MB") 20 | print("--- %s seconds ---" % (time.time() - start_time)) 21 | print(hyperparams) 22 | 23 | print(hyperparams) 24 | 25 | 26 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 27 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 28 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 29 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 30 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 31 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 32 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 33 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 34 | 35 | 36 | # Now train final model and dump the model 37 | 38 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 39 | Y = data['Malicious'].astype('int').values 40 | 41 | 42 | classifier_RF 
=RandomForestClassifier(random_state=123,criterion=hyperparams['criterion'],n_estimators=hyperparams['n_estimators'],max_depth=hyperparams['max_depth'],max_features=hyperparams['max_features'],min_samples_leaf=hyperparams['min_sample_leaf'],min_samples_split=hyperparams['min_sample_split'],max_samples=hyperparams['max_samples']) 43 | classifier_RF.fit(X=X, y=Y) 44 | 45 | 46 | 47 | # Get current date and time 48 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 49 | # save the file in the current work directory 50 | joblib_file='./Py_monolanguage_RF_'+dt+'.pkl' 51 | joblib.dump(classifier_RF,joblib_file) -------------------------------------------------------------------------------- /scripts/Py_monolanguage_XGBoost_train_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import joblib 3 | import datetime 4 | from utilities_functions import * 5 | import psutil 6 | data=pd.read_csv('./Labelled_Dataset.csv',sep=',') 7 | 8 | data = data[data['Package Repository']=='PyPI'] 9 | 10 | 11 | 12 | # Train and evaluate performances 13 | import time 14 | start_time = time.time() 15 | process = psutil.Process() 16 | initial_memory = process.memory_info().rss / (1024 ** 2) # Initial memory usage in MB 17 | performance, hyperparams=evaluation_NPM_Pypi_xgb(data) 18 | final_memory = process.memory_info().rss / (1024 ** 2) # Final memory usage in MB 19 | memory_usage = final_memory - initial_memory 20 | print(f"Memory usage: {memory_usage} MB") 21 | print("--- %s seconds ---" % (time.time() - start_time)) 22 | 23 | print(hyperparams) 24 | 25 | 26 | print("Test Precision : ",round(performance.iloc[-16,0],2),"±",round(performance.iloc[-16,1],2),'%') 27 | print("Test Recall : ",round(performance.iloc[-15,0],2),"±",round(performance.iloc[-15,1],2),'%') 28 | print("Test F1-Score : ",round(performance.iloc[-14,0],2),"±",round(performance.iloc[-14,1],2),'%') 29 | print("Test Accuracy : ",round(performance.iloc[-13,0],2),"±",round(performance.iloc[-13,1],2),'%') 30 | print("Test False Positive (benign packages classified as malicious one) : ",round(performance.iloc[-12,0],0),"±",round(performance.iloc[-12,1],0),'%') 31 | print("Test False Negative (malicious packages classified as benign): ",round(performance.iloc[-11,0],0),"±",round(performance.iloc[-11,1],0),'%') 32 | print("Test True Negative: ",round(performance.iloc[-10,0],0),"±",round(performance.iloc[-10,1],0),'%') 33 | print("Test True Positive: ",round(performance.iloc[-9,0],0),"±",round(performance.iloc[-9,1],0),'%') 34 | 35 | 36 | # Now train final model and dump the model 37 | 38 | X = data.drop(labels=['Package Repository','Malicious','Package Name'],axis=1).values 39 | Y = data['Malicious'].astype('int').values 40 | 41 | classifier_XGBoost =xgb.XGBClassifier(random_state=123,n_estimators=hyperparams['n_estimators'],max_depth=hyperparams['max_depth'],gamma=hyperparams['gamma'],eta=hyperparams['eta'],colsample_bytree=hyperparams['colsample_bytree'],min_child_weight=hyperparams['min_child_weight']) 42 | classifier_XGBoost.fit(X=X, y=Y) 43 | 44 | 45 | 46 | 47 | # Get current date and time 48 | dt = str(datetime.datetime.now()).split('.')[0].replace(' ','-').replace(":",'_') 49 | # save the file in the current work directory 50 | joblib_file='./Py_monolanguage_XGB_'+dt+'.pkl' 51 | joblib.dump(classifier_XGBoost,joblib_file) -------------------------------------------------------------------------------- /scripts/feature_extraction/README.md: 
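The extractor classes documented below produce one feature row per analysed package. A hedged sketch of chaining the npm extractor with one of the pickled models from the repository root; the relative path to the .pkl file, the dropped columns and the label encoding are assumptions that may need adjusting to match the features the model was actually trained on:

import joblib
from npm_feature_extractor import NPM_Feature_Extractor

# Extract features for a folder of downloaded .tgz packages; the extractor also
# writes the same table to npm_feature_extracted.csv.
extractor = NPM_Feature_Extractor()
features = extractor.extract_features("npm_samples")

# Assumed: the pickled cross-language model expects the numeric feature columns
# laid out as in Labelled_Dataset.csv, without the package name.
model = joblib.load("../../Crosslanguage_model.pkl")
X = features.drop(labels=["Package Name"], axis=1, errors="ignore").values
for name, label in zip(features["Package Name"], model.predict(X)):
    print(name, "malicious" if label == 1 else "benign")  # assumed encoding: 1 = malicious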
-------------------------------------------------------------------------------- 1 | Sample code to extract features as described in the paper: 2 | Piergiorgio Ladisa, Serena Elisa Ponta, Nicola Ronzoni, Matias Martinez, Olivier Barais: 3 | On the Feasibility of Cross-Language Detection of Malicious Packages in npm and PyPI 4 | ACSAC 2023 -------------------------------------------------------------------------------- /scripts/feature_extraction/npm-test.py: -------------------------------------------------------------------------------- 1 | from npm_feature_extractor import NPM_Feature_Extractor 2 | 3 | 4 | npm_fe = NPM_Feature_Extractor() 5 | input_data = npm_fe.extract_features("npm_samples") 6 | -------------------------------------------------------------------------------- /scripts/feature_extraction/npm_feature_extractor.py: -------------------------------------------------------------------------------- 1 | from typing import final 2 | import nltk 3 | import json, os, tarfile 4 | import utilities_functions 5 | import pandas as pd 6 | import numpy as np 7 | import statistics 8 | import stat 9 | 10 | from pygments.lexers import JavascriptLexer, JsonLexer 11 | from pygments.token import Token 12 | from pathlib import Path 13 | 14 | from functools import reduce 15 | from urlextract import URLExtract 16 | 17 | 18 | 19 | class NPM_Feature_Extractor: 20 | 21 | 22 | def __init__(self) : 23 | 24 | 25 | # extensions 26 | # classes: source code, compiled code, packages code, image, video, audio, archive, font, apps, document, data, web, security, database. 27 | self.classes = ['bat', 'bz2', 'c', 'cert', 'conf' ,'cpp' ,'crt', 'css', 'csv', 'deb' ,'erb', 'gemspec', 'gif', 'gz', 'h', 'html', 'ico' ,'ini' ,'jar', 'java', 'jpg', 'js', 'json', 'key' ,'m4v' ,'markdown' ,'md' ,'pdf', 'pem', 'png', 'ps', 'py', 'rb', 'rpm', 'rst','sh' ,'svg', 'toml', 'ttf', 'txt','xml', 'yaml', 'yml', 'eot', 'exe', 'jpeg', 'properties', 'sql', 'swf', 'tar', 'woff', 'woff2', 'aac','bmp', 'cfg' ,'dcm', 'dll', 'doc', 'flac','flv', 'ipynb', 'm4a', 'mid', 'mkv', 'mp3', 'mp4', 'mpg', 'ogg','otf', 'pickle', 'pkl' ,'psd', 'pxd' ,'pxi', 'pyc', 'pyx', 'r', 'rtf', 'so', 'sqlite' ,'tif', 'tp', 'wav', 'webp' ,'whl', 'xcf', 'xz', 'zip' ,'mov' ,'wasm', 'webm'] 28 | # stopwords 29 | nltk.download('stopwords') 30 | self.stopwords = set(nltk.corpus.stopwords.words('english')) 31 | 32 | # dangerous token 33 | with open('resources/dangerous_tokens.json', 'r') as file: 34 | self.dangerous_token = json.load(file) 35 | 36 | 37 | def extract_features(self, path: str) -> pd.DataFrame: 38 | ''' 39 | Executes the whole pipeline for the extraction of 40 | the features from the packages contained in the provided path 41 | 42 | Input: Path to the set of samples to be classified 43 | Output: Dataframe containing extracted data for each package 44 | ''' 45 | 46 | self.path_to_scan = path 47 | self.unzip_packages() 48 | javascript_files_df = self.extract_feature_from_js() 49 | 50 | packagejson_files_df = self.extract_feature_from_package_json() 51 | extensions_files_df = self.count_package_files_extension() 52 | 53 | dfs = [javascript_files_df, packagejson_files_df,extensions_files_df] 54 | final_df = reduce(lambda left,right: pd.merge(left,right,on=['Package Name'], 55 | how='outer'), dfs) 56 | final_df = self.extraction(final_df, utilities_functions.gen_language_4,4,utilities_functions.gen_language_4,4) 57 | final_df.to_csv("npm_feature_extracted.csv", encoding='utf-8', index=False) 58 | return final_df 59 | 60 | 61 | def unzip_packages(self) -> 
None: 62 | ''' 63 | Unzips the .tgz file of each NPM package 64 | ''' 65 | 66 | for root, dirs, files in os.walk(self.path_to_scan): 67 | for file in files: 68 | if file.endswith(".tgz"): 69 | if os.path.getsize(os.path.join(self.path_to_scan,file)) > 0: 70 | 71 | output_dir="".join((self.path_to_scan,"/",file.split(".tgz")[0])) 72 | print(f"[*] Processing {file}") 73 | pkg_file = tarfile.open(os.path.join(self.path_to_scan,file)) 74 | pkg_file.extractall(output_dir) 75 | #os.chmod(output_dir, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH) 76 | 77 | pkg_file.close() 78 | 79 | 80 | 81 | def extract_feature_from_js(self) -> pd.DataFrame: 82 | 83 | ''' 84 | Extract the features from the list of paths containing JS files 85 | 86 | Input: list of path and string for the specific extension, .js extension, stopwords to be removed. 87 | Output: pandas dataframe 88 | 89 | ''' 90 | 91 | files_path = utilities_functions.find_files_of_ext(self.path_to_scan, ".js") 92 | 93 | #initialize the lists 94 | Package=list() 95 | version=list() 96 | jsfile=list() 97 | strings=list() 98 | identifiers=list() 99 | sospicious_token=list() 100 | lines=list() 101 | plus_ratio=list() 102 | equal_ratio=list() 103 | square_ratio=list() 104 | Base64=list() 105 | ip=list() 106 | code=list() 107 | #initialize pandas 108 | db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 109 | for i in range(len(files_path)): 110 | # initialize the list for the puntuactions and operators token 111 | operator=[] 112 | punctuation=[] 113 | other=[] 114 | id=[] 115 | strs=[] 116 | p=Path(files_path[i]) 117 | # package name 118 | package_name=p.parts[2] 119 | # name of the file 120 | js=p.parts[-1] 121 | file = open(files_path[i],"r",encoding="utf8",errors='ignore',newline='\n') 122 | # convert to string 123 | data=file.read() 124 | # apply the lexer specific for language 125 | lexer=JavascriptLexer(stripnl=False,ensurenl=False) 126 | token_source = lexer.get_tokens(data) 127 | for token in token_source: 128 | if token[0] in Token.Operator: 129 | operator.append(token[1]) 130 | elif token[0] in Token.Punctuation: 131 | punctuation.append(token[1]) 132 | elif token[0] in Token.Name: 133 | id.append(token[1]) 134 | elif (token[0] in Token.Literal.String.Single or token[0] in Token.Literal.String.Double or token[0] in Token.Literal.String.Affix or token[0] in Token.Literal.String.Backtick or token[0] in Token.Literal.String.Char or token[0] in Token.Literal.String.Delimiter or token[0] in Token.Literal.String.Doc or token[0] in Token.Literal.String.Escape or token[0] in Token.Literal.String.Heredoc or token[0] in Token.Literal.String.Interpol or token[0] in Token.Literal.String.Other): 135 | strs.append(token[1]) 136 | else: 137 | other.append(token[1]) 138 | with open(files_path[i],"r",encoding="utf8",errors='ignore') as fp: 139 | num_lines = sum(1 for line in fp) 140 | size = fp.seek(0, os.SEEK_END) 141 | size+=1 142 | 143 | id = [s.replace("'", '') for s in id] 144 | id = [s.replace('"', '') for s in id] 145 | id_=' '.join(id) 146 | equalities=operator.count('=')/size 147 | plus=operator.count('+')/size 148 | Lbrackets=punctuation.count('[')/size 149 | count_base64=0 150 | count_IP=0 151 | byte=0 152 | for value in range(0,len(strs)): 153 | count_base64+=len(utilities_functions.contains_base64(strs[value])) 154 | count_IP+=len(utilities_functions.contains_IPAddress(strs[value])) 155 | byte+=len(utilities_functions.contains_dangerous_token(strs[value],self.dangerous_token)) 156 | 157 | strs = [s.replace("'", '') for s in 
strs] 158 | strs = [s.replace('"', '') for s in strs] 159 | string=' '.join(strs).split() 160 | 161 | #remove stopwords 162 | string=list(set(strs)-self.stopwords) 163 | string_=' '.join(string) 164 | 165 | file.close() 166 | fp.close() 167 | #append result to list 168 | code.append(data) 169 | Package.append(package_name) 170 | jsfile.append(js) 171 | sospicious_token.append(byte) 172 | lines.append(num_lines) 173 | plus_ratio.append(plus) 174 | equal_ratio.append(equalities) 175 | square_ratio.append(Lbrackets) 176 | identifiers.append(id_) 177 | Base64.append(count_base64) 178 | ip.append(count_IP) 179 | strings.append(string_) 180 | # assign to pandas dataframe 181 | db['Package Name']=Package 182 | db['.js']=jsfile 183 | db['sospicious token']=sospicious_token 184 | db['lines']=lines 185 | db['equal ratio']=equal_ratio 186 | db['plus ratio']=plus_ratio 187 | db['bracket ratio']=square_ratio 188 | db['identifiers']=identifiers 189 | db['base64']=Base64 190 | db['IP']=ip 191 | db['strings']=strings 192 | db['code']=code 193 | 194 | return self.merge_js_of_same_package(db) 195 | 196 | 197 | def merge_js_of_same_package(self, database: pd.DataFrame) -> pd.DataFrame: 198 | # function that merge .js file inside the same package 199 | p_database= database.groupby(['Package Name'], as_index=False)['code'].agg('\n'.join) 200 | p_database['Number of words'] = p_database["code"].apply(lambda n: len(n.split())) 201 | l_database = database.groupby(['Package Name'], as_index=False)['lines'].sum() 202 | plus_mean= database.groupby(['Package Name'], as_index=False)['plus ratio'].mean() 203 | plus_mean = plus_mean.rename(columns={"plus ratio": "plus ratio mean"}) 204 | plus_max= database.groupby(['Package Name'], as_index=False)['plus ratio'].max() 205 | plus_max = plus_max.rename(columns={"plus ratio": "plus ratio max"}) 206 | plus_std= database.groupby(['Package Name'], as_index=False)['plus ratio'].std() 207 | plus_std = plus_std.rename(columns={"plus ratio": "plus ratio std"}) 208 | plus_q3= database.groupby(['Package Name'], as_index=False)['plus ratio'].quantile(0.75) 209 | plus_q3 = plus_q3.rename(columns={"plus ratio": "plus ratio q3"}) 210 | eq_mean= database.groupby(['Package Name'], as_index=False)['equal ratio'].mean() 211 | eq_mean = eq_mean.rename(columns={"equal ratio": "equal ratio mean"}) 212 | eq_max= database.groupby(['Package Name'], as_index=False)['equal ratio'].max() 213 | eq_max = eq_max.rename(columns={"equal ratio": "equal ratio max"}) 214 | eq_std= database.groupby(['Package Name'], as_index=False)['equal ratio'].std() 215 | eq_std = eq_std.rename(columns={"equal ratio": "equal ratio std"}) 216 | eq_q3= database.groupby(['Package Name'], as_index=False)['equal ratio'].quantile(0.75) 217 | eq_q3 = eq_q3.rename(columns={"equal ratio": "equal ratio q3"}) 218 | bracket_mean= database.groupby(['Package Name'], as_index=False)['bracket ratio'].mean() 219 | bracket_mean = bracket_mean.rename(columns={"bracket ratio": "bracket ratio mean"}) 220 | bracket_max= database.groupby(['Package Name'], as_index=False)['bracket ratio'].max() 221 | bracket_max = bracket_max.rename(columns={"bracket ratio": "bracket ratio max"}) 222 | bracket_std= database.groupby(['Package Name'], as_index=False)['bracket ratio'].std() 223 | bracket_std = bracket_std.rename(columns={"bracket ratio": "bracket ratio std"}) 224 | bracket_q3= database.groupby(['Package Name'], as_index=False)['bracket ratio'].quantile(0.75) 225 | bracket_q3 = bracket_q3.rename(columns={"bracket ratio": "bracket ratio q3"}) 226 | 
base = database.groupby(['Package Name'], as_index=False)['base64'].sum() 227 | ip = database.groupby(['Package Name'], as_index=False)['IP'].sum() 228 | sospicious = database.groupby(['Package Name'], as_index=False)['sospicious token'].sum() 229 | string = database.groupby(['Package Name'], as_index=False)['strings'].agg(' '.join) 230 | identifier = database.groupby(['Package Name'], as_index=False)['identifiers'].agg(' '.join) 231 | #url = database.groupby(['Package Name'], as_index=False)['url'].sum() 232 | #p_database['Number of files']=database.groupby(['Package Name', 'version'], as_index=False)['Package Name'].count()['Package Name'] 233 | # merge p_database and l_dataabse 234 | data = [p_database,l_database,plus_mean,plus_max,plus_std,plus_q3,eq_mean,eq_max,eq_std,eq_q3,bracket_mean,bracket_max,bracket_std,bracket_q3,base,ip,sospicious,string,identifier] 235 | #merge all DataFrames into one 236 | final_database = reduce(lambda left,right: pd.merge(left,right,on=['Package Name'], how='outer'), data) 237 | final_database.drop('code',axis=1,inplace=True) 238 | final_database.columns=['Package Name','Number of words','lines','plus ratio mean','plus ratio max','plus ratio std','plus ratio q3','eq ratio mean','eq ratio max','eq ratio std','eq ratio q3','bracket ratio mean','bracket ratio max','bracket ratio std','bracket ratio q3','base64','IP','sospicious token','strings','identifiers'] 239 | return (final_database) 240 | 241 | 242 | 243 | 244 | def extract_feature_from_package_json(self) -> pd.DataFrame: 245 | ''' 246 | input: list of path and string for the specific extension, .json extension, stopwords to be removed. 247 | output a pandas dataframe 248 | 249 | ''' 250 | 251 | files_path = utilities_functions.find_files_of_ext(self.path_to_scan, '.json') 252 | #initialize the lists 253 | Package=list() 254 | version=list() 255 | jsfile=list() 256 | strings=list() 257 | identifiers=list() 258 | sospicious_token=list() 259 | lines=list() 260 | plus_ratio=list() 261 | equal_ratio=list() 262 | square_ratio=list() 263 | Base64=list() 264 | ip=list() 265 | code=list() 266 | #initialize pandas 267 | db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 268 | for i in range(len(files_path)): 269 | if files_path[i].split('/')[-1] == "package.json": 270 | # initialize the list for the puntuactions and operators token 271 | operator=[] 272 | punctuation=[] 273 | other=[] 274 | id=[] 275 | strs=[] 276 | p=Path(files_path[i]) 277 | # package name 278 | package_name=p.parts[2] 279 | # name of the file 280 | js=p.parts[-1] 281 | file = open(files_path[i],"r",encoding="utf8",errors='ignore',newline='\n') 282 | # convert to string 283 | data=file.read() 284 | # apply the lexer specific for language 285 | lexer=JsonLexer(stripnl=False,ensurenl=False) 286 | token_source = lexer.get_tokens(data) 287 | for token in token_source: 288 | if token[0] in Token.Operator: 289 | operator.append(token[1]) 290 | elif token[0] in Token.Punctuation: 291 | punctuation.append(token[1]) 292 | elif token[0] in Token.Name: 293 | id.append(token[1]) 294 | elif (token[0] in Token.Literal.String.Single or token[0] in Token.Literal.String.Double or token[0] in Token.Literal.String.Affix or token[0] in Token.Literal.String.Backtick or token[0] in Token.Literal.String.Char or token[0] in Token.Literal.String.Delimiter or token[0] in Token.Literal.String.Doc or token[0] in Token.Literal.String.Escape or token[0] in Token.Literal.String.Heredoc or token[0] in Token.Literal.String.Interpol or token[0] in 
Token.Literal.String.Other): 295 | strs.append(token[1]) 296 | else: 297 | other.append(token[1]) 298 | with open(files_path[i],"r",encoding="utf8",errors='ignore') as fp: 299 | num_lines = sum(1 for line in fp) 300 | size = fp.seek(0, os.SEEK_END) 301 | size+=1 302 | id = [s.replace("'", '') for s in id] 303 | id = [s.replace('"', '') for s in id] 304 | id_=' '.join(id) 305 | equalities=operator.count('=')/size 306 | plus=operator.count('+')/size 307 | Lbrackets=punctuation.count('[')/size 308 | count_base64=0 309 | count_IP=0 310 | byte=0 311 | for value in range(0,len(strs)): 312 | count_base64+=len(utilities_functions.contains_base64(strs[value])) 313 | count_IP+=len(utilities_functions.contains_IPAddress(strs[value])) 314 | byte+= len(utilities_functions.contains_dangerous_token(strs[value],self.dangerous_token)) 315 | strs = [s.replace("'", '') for s in strs] 316 | strs = [s.replace('"', '') for s in strs] 317 | string=' '.join(strs).split() 318 | #remove stopwords 319 | string=list(set(strs)-self.stopwords) 320 | string_=' '.join(string) 321 | file.close() 322 | fp.close() 323 | #append result to list 324 | code.append(data) 325 | Package.append(package_name) 326 | jsfile.append(js) 327 | sospicious_token.append(byte) 328 | lines.append(num_lines) 329 | plus_ratio.append(plus) 330 | equal_ratio.append(equalities) 331 | square_ratio.append(Lbrackets) 332 | identifiers.append(id_) 333 | Base64.append(count_base64) 334 | ip.append(count_IP) 335 | strings.append(string_) 336 | # assign to pandas dataframe 337 | db['Package Name']=Package 338 | db['.json']=jsfile 339 | db['sospicious token']=sospicious_token 340 | db['lines']=lines 341 | db['equal ratio']=equal_ratio 342 | db['plus ratio']=plus_ratio 343 | db['bracket ratio']=square_ratio 344 | db['identifiers']=identifiers 345 | db['base64']=Base64 346 | db['IP']=ip 347 | db['strings']=strings 348 | db['code']=code 349 | #db['Number of words'] = db["code"].apply(lambda n: len(n.split())) 350 | return self.p_db_benign_md(db) 351 | 352 | def p_db_benign_md(self,database): 353 | p_database= database.groupby(['Package Name'], as_index=False)['code'].agg('\n'.join) 354 | p_database['Number of words'] = p_database["code"].apply(lambda n: len(n.split())) 355 | l_database = database.groupby(['Package Name'], as_index=False)['lines'].sum() 356 | base = database.groupby(['Package Name'], as_index=False)['base64'].sum() 357 | ip = database.groupby(['Package Name'], as_index=False)['IP'].sum() 358 | sospicious = database.groupby(['Package Name'], as_index=False)['sospicious token'].sum() 359 | string = database.groupby(['Package Name'], as_index=False)['strings'].agg(' '.join) 360 | identifier = database.groupby(['Package Name'], as_index=False)['identifiers'].agg(' '.join) 361 | #url = database.groupby(['Package Name'], as_index=False)['url'].sum() 362 | #p_database['Number of files']=database.groupby(['Package Name', 'version'], as_index=False)['Package Name'].count()['Package Name'] 363 | # merge p_database and l_dataabse 364 | data = [p_database,l_database,base,ip,sospicious,string,identifier] 365 | #merge all DataFrames into one 366 | final_database = reduce(lambda left,right: pd.merge(left,right,on=['Package Name'], how='outer'), data) 367 | final_database.drop('code',axis=1,inplace=True) 368 | final_database.columns=['Package Name','Number of words','lines','base64','IP','sospicious token','strings','identifiers'] 369 | return (final_database) 370 | 371 | 372 | def count_package_files_extension(self) -> pd.DataFrame: 373 | #function for 
extraction number of files with a given extension inside a given package 374 | #root: folder that contains the malicious packages 375 | # classes: list of extension we are looking for 376 | #function to add a point before the list of extensions 377 | 378 | #initialize the lists 379 | Package=list() 380 | extension=list() 381 | #initialize pandas 382 | db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 383 | # for each extension 384 | for i in range(0,len(self.classes)): 385 | #extract the extension we are interested in: 386 | ext='.'+self.classes[i] 387 | files_path=utilities_functions.find_files_of_ext(self.path_to_scan,ext) 388 | # foe each file path 389 | for j in range(len(files_path)): 390 | # extract the path 391 | p=Path(files_path[j]) 392 | # package name 393 | package_name=p.parts[2] 394 | # version name 395 | Package.append(package_name) 396 | extension.append(ext) 397 | db['Package Name']=Package 398 | db['extension']=extension 399 | # count frequency of extension, grouped by package name and version 400 | db=db.groupby(['Package Name', 'extension']).size().unstack(fill_value=0) 401 | # for each package keep only the last version 402 | db=db.reset_index() 403 | db=db.groupby('Package Name').last() 404 | 405 | def add_to_beginning(s, start='.'): 406 | return start + s 407 | extensions = list(map(add_to_beginning, self.classes)) 408 | #select extensions not founded in the initial list 409 | f = [c for c in extensions if c not in db.columns] 410 | #add them to the dataframe 411 | db = pd.concat([db,pd.DataFrame(columns = f)]) 412 | # fill Nan with 0 413 | db[f] = db[f].fillna(0) 414 | # order the column 415 | db=db[extensions] 416 | db.reset_index(inplace=True) 417 | db =db.rename(columns = {'index':'Package Name'}) 418 | return (db) 419 | 420 | # function for shannon entropy extraction, url heterogeneous strings and identifiers 421 | def extraction(self,database,alphabetic_string,base_string,alphabetic_id,base_id): 422 | extractor = URLExtract() 423 | # repository for NPM 424 | database['repository'] = pd.Series([1 for x in range(len(database.index))]) 425 | f = [c for c in database.columns if c not in ['strings_x','identifiers_x','strings_y','identifiers_y']] 426 | database[f] = database[f].fillna(0) 427 | # reset index 428 | database.index=range(0,len(database)) 429 | #extractor.update() For updating TLDs list 430 | # define code to inspect and name of the package 431 | source_code_strings=database['strings_x'] 432 | source_code_identifiers=database['identifiers_x'] 433 | metadata_strings=database['strings_y'] 434 | metadata_identifiers=database['identifiers_y'] 435 | name=database['Package Name'] 436 | repository=database['repository'] 437 | check_metadata_strings=metadata_strings.isna() 438 | check_metadata_identifiers=metadata_identifiers.isna() 439 | check_source_code_strings=source_code_strings.isna() 440 | check_source_code_identifiers=source_code_identifiers.isna() 441 | #initilize lists: one value for each package. 
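# Annotation (not part of the original script): each string and identifier gathered
# above is first mapped through the alphabetic_* callable passed to this method
# (utilities_functions.gen_language_4, which presumably generalises every character
# into a four-symbol alphabet, e.g. uppercase/digit/lowercase/special -> 'u','d','l','s'),
# and the Shannon entropy of the generalised token is then computed in that base.
# Tokens whose entropy is exactly zero are counted as "homogeneous" strings/identifiers,
# while the obfuscation() helper over the generalised tokens yields the "heterogeneous"
# features. A self-contained sketch of the entropy step alone:
#
#     import math, collections
#     def entropy_sketch(token: str, base: int) -> float:
#         if not token:
#             return 0.0
#         counts = collections.Counter(token)
#         return -sum((c / len(token)) * math.log(c / len(token), base)
#                     for c in counts.values())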
442 | # source code shannon's features 443 | q3_id_sc=[] 444 | q3_str_sc=[] 445 | m_id_sc=[] 446 | m_str_sc=[] 447 | dev_id_sc=[] 448 | dev_str_sc=[] 449 | maximum_id_sc=[] 450 | maximum_str_sc=[] 451 | flat_id_sc=[] 452 | flat_string_sc=[] 453 | count_url_sc=[] 454 | obf_id_sc=[] 455 | obf_string_sc=[] 456 | # metadata shannon's features 457 | q3_id_md=[] 458 | q3_str_md=[] 459 | m_id_md=[] 460 | m_str_md=[] 461 | dev_id_md=[] 462 | dev_str_md=[] 463 | maximum_id_md=[] 464 | maximum_str_md=[] 465 | flat_id_md=[] 466 | flat_string_md=[] 467 | count_url_md=[] 468 | obf_id_md=[] 469 | obf_string_md=[] 470 | # installation script feature in metadata 471 | installation=[] 472 | #db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 473 | for i in range(len(database)): 474 | print(name[i]) 475 | # select the entry points specific for each language 476 | if repository[i]==3: 477 | install=['extensions'] 478 | elif repository[i]==2: 479 | install=['install'] 480 | else: 481 | install=['postinstall','preinstall','install'] 482 | # source code 483 | if check_source_code_strings[i]==False: 484 | # string 485 | string_sourcecode=source_code_strings[i] 486 | # create a list of strings from a unique string 487 | string=string_sourcecode.split() 488 | else: 489 | string=[] 490 | if check_source_code_identifiers[i]==False: 491 | # identifiers 492 | identifiers_sourcecode=source_code_identifiers[i] 493 | # create a list of identifiers from a unique string 494 | identifiers=identifiers_sourcecode.split() 495 | else: 496 | identifiers=[] 497 | # apply the generalization language 498 | generalization_str=[] 499 | generalization_id=[] 500 | # identifiers 501 | for h in range(0,len(identifiers)): 502 | gen=alphabetic_id(identifiers[h]) 503 | generalization_id.append(gen) 504 | obf_sc=utilities_functions.obfuscation(generalization_id,symbols=['u','d','l','s']) 505 | # strings 506 | url_sc=0 507 | for k in range(0,len(string)): 508 | try: 509 | url_sc+=len(extractor.find_urls(string[k])) 510 | except: 511 | url_sc += len(utilities_functions.contains_URL(string[k])) 512 | 513 | gen=alphabetic_string(string[k]) 514 | generalization_str.append(gen) 515 | obf_sc_str=utilities_functions.obfuscation(generalization_str,symbols=['u','d','l','s']) 516 | # apply shannon entropy 517 | shannon_str=[] 518 | shannon_id=[] 519 | # identifiers 520 | for w in range(0,len(generalization_id)): 521 | shan=utilities_functions.shannon_entropy(generalization_id[w],base_id) 522 | shannon_id.append(shan) 523 | # strings 524 | for y in range(0,len(generalization_str)): 525 | shan=utilities_functions.shannon_entropy(generalization_str[y],base_string) 526 | shannon_str.append(shan) 527 | # remove shannon values which are equal to 0 528 | #shannon_str_no0 = list(filter(lambda x: abs(x) != 0,shannon_str)) 529 | #shannon_id_no0 = list(filter(lambda x: abs(x) != 0, shannon_id)) 530 | null_string_sc=len(list(filter(lambda x: abs(x) == 0,shannon_str))) 531 | null_id_sc=len(list(filter(lambda x: abs(x) == 0, shannon_id))) 532 | #shannon_str=shannon_str_no0 533 | #shannon_id=shannon_id_no0 534 | if len(shannon_str)>=1: 535 | mean_str=statistics.mean(shannon_str) 536 | max_str=max(shannon_str) 537 | quart_str=np.quantile(shannon_str,0.75) 538 | else: 539 | mean_str=0 540 | max_str=0 541 | quart_str=0 542 | if len(shannon_str)>1: 543 | std_str=np.std(shannon_str) 544 | else: 545 | std_str=0 546 | if len(shannon_id)>=1: 547 | mean_id=statistics.mean(shannon_id) 548 | max_id=max(shannon_id) 549 | quart_id=np.quantile(shannon_id, 
0.75) 550 | else: 551 | mean_id=0 552 | max_id=0 553 | quart_id=0 554 | if len(shannon_id)>1: 555 | std_id=np.std(shannon_id) 556 | else: 557 | std_id=0 558 | m_str_sc.append(mean_str) 559 | dev_str_sc.append(std_str) 560 | maximum_str_sc.append(max_str) 561 | q3_str_sc.append(quart_str) 562 | m_id_sc.append(mean_id) 563 | dev_id_sc.append(std_id) 564 | maximum_id_sc.append(max_id) 565 | q3_id_sc.append(quart_id) 566 | flat_id_sc.append(null_id_sc) 567 | flat_string_sc.append(null_string_sc) 568 | count_url_sc.append(url_sc) 569 | obf_id_sc.append(obf_sc) 570 | obf_string_sc.append(obf_sc_str) 571 | #metadata analysis 572 | # string 573 | if check_metadata_strings[i]==False: 574 | string_metadata=metadata_strings[i] 575 | # create a list of strings from a unique string 576 | string_md=string_metadata.split() 577 | else: 578 | string_md=[] 579 | # identifiers 580 | if check_metadata_identifiers[i]==False: 581 | identifiers_metadata=metadata_identifiers[i] 582 | # create a list of identifiers from a unique string 583 | identifiers_md=identifiers_metadata.split() 584 | if any(f in identifiers_md for f in install)==True: 585 | install_script=1 586 | else: 587 | install_script=0 588 | else: 589 | identifiers_md=[] 590 | install_script=0 591 | # apply the generalization language 592 | generalization_str_md=[] 593 | generalization_id_md=[] 594 | # identifiers 595 | for h in range(0,len(identifiers_md)): 596 | gen=alphabetic_id(identifiers_md[h]) 597 | generalization_id_md.append(gen) 598 | obf_md=utilities_functions.obfuscation(generalization_id_md,symbols=['u','d','l','s']) 599 | # strings 600 | url_md=0 601 | for k in range(0,len(string_md)): 602 | url_md+=len(extractor.find_urls(string_md[k])) 603 | gen=alphabetic_string(string_md[k]) 604 | generalization_str_md.append(gen) 605 | obf_md_str=utilities_functions.obfuscation(generalization_str_md,symbols=['u','d','l','s']) 606 | # apply shannon entropy 607 | shannon_str_md=[] 608 | shannon_id_md=[] 609 | # identifiers 610 | for w in range(0,len(generalization_id_md)): 611 | shan=utilities_functions.shannon_entropy(generalization_id_md[w],base_id) 612 | shannon_id_md.append(shan) 613 | # strings 614 | for y in range(0,len(generalization_str_md)): 615 | shan=utilities_functions.shannon_entropy(generalization_str_md[y],base_string) 616 | shannon_str_md.append(shan) 617 | # remove shannon values which are equal to 0 618 | #shannon_str_md_no0 = list(filter(lambda x: abs(x) != 0,shannon_str_md)) 619 | #shannon_id_md_no0 = list(filter(lambda x: abs(x) != 0, shannon_id_md)) 620 | null_id_md=len(list(filter(lambda x: abs(x) == 0, shannon_id_md))) 621 | null_string_md=len(list(filter(lambda x: abs(x) == 0,shannon_str_md))) 622 | #shannon_str_md=shannon_str_md_no0 623 | #shannon_id_md=shannon_id_md_no0 624 | if len(shannon_str_md)>=1: 625 | mean_str_md=statistics.mean(shannon_str_md) 626 | max_str_md=max(shannon_str_md) 627 | quart_str_md=np.quantile(shannon_str_md,0.75) 628 | else: 629 | mean_str_md=0 630 | max_str_md=0 631 | quart_str_md=0 632 | if len(shannon_str_md)>1: 633 | std_str_md=np.std(shannon_str_md) 634 | else: 635 | std_str_md=0 636 | if len(shannon_id_md)>=1: 637 | mean_id_md=statistics.mean(shannon_id_md) 638 | max_id_md=max(shannon_id_md) 639 | quart_id_md=np.quantile(shannon_id_md, 0.75) 640 | else: 641 | mean_id_md=0 642 | max_id_md=0 643 | quart_id_md=0 644 | if len(shannon_id_md)>1: 645 | std_id_md=np.std(shannon_id_md) 646 | else: 647 | std_id_md=0 648 | installation.append(install_script) 649 | m_str_md.append(mean_str_md) 650 | 
dev_str_md.append(std_str_md) 651 | maximum_str_md.append(max_str_md) 652 | q3_str_md.append(quart_str_md) 653 | m_id_md.append(mean_id_md) 654 | dev_id_md.append(std_id_md) 655 | maximum_id_md.append(max_id_md) 656 | q3_id_md.append(quart_id_md) 657 | flat_id_md.append(null_id_md) 658 | flat_string_md.append(null_string_md) 659 | count_url_md.append(url_md) 660 | obf_id_md.append(obf_md) 661 | obf_string_md.append(obf_md_str) 662 | # assign columns to the existing dataframe 663 | pd.options.mode.chained_assignment = None 664 | import warnings 665 | warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) 666 | # note that this type of assignment is not efficient 667 | database['presence of installation script']=installation 668 | database['shannon mean ID source code']=m_id_sc 669 | database['shannon std ID source code']=dev_id_sc 670 | database['shannon max ID source code']=maximum_id_sc 671 | database['shannon q3 ID source code']=q3_id_sc 672 | database['shannon mean string source code']=m_str_sc 673 | database['shannon std string source code']=dev_str_sc 674 | database['shannon max string source code']=maximum_str_sc 675 | database['shannon q3 string source code']=q3_str_sc 676 | database['homogeneous identifiers in source code']=flat_id_sc 677 | database['homogeneous strings in source code']=flat_string_sc 678 | database['heteregeneous identifiers in source code']=obf_id_sc 679 | database['heterogeneous strings in source code']=obf_string_sc 680 | database['URLs in source code']=count_url_sc 681 | # metadata features 682 | database['shannon mean ID metadata']=m_id_md 683 | database['shannon std ID metadata']=dev_id_md 684 | database['shannon max ID metadata']=maximum_id_md 685 | database['shannon q3 ID metadata']=q3_id_md 686 | database['shannon mean string metadata']=m_str_md 687 | database['shannon std string metadata']=dev_str_md 688 | database['shannon max string metadata']=maximum_str_md 689 | database['shannon q3 string metadata']=q3_str_md 690 | database['homogeneous identifiers in metadata']=flat_id_md 691 | database['homogeneous strings in metadata']=flat_string_md 692 | database['heterogeneous strings in metadata']=obf_string_md 693 | database['URLs in metadata']=count_url_md 694 | database['heteregeneous identifiers in metadata']=obf_id_md 695 | # drop code_x and code_y: raw source code and metadata 696 | database.drop(['strings_x', 'strings_y','identifiers_x','identifiers_y'], axis=1, inplace=True) 697 | # remove duplicates based on some numeric features 698 | database.drop_duplicates(subset=['Number of words_x','Number of words_y','lines_x','lines_y','repository','presence of installation script'],keep='first',inplace=True) 699 | # change the column name of Number of Words_x, Number of Words_y in Number of Words in source code, Number of Words in metadata 700 | database.rename(columns={'Number of words_x':'Number of Words in source code'},inplace=True) 701 | database.rename(columns={'Number of words_y':'Number of Words in metadata'},inplace=True) 702 | database.rename(columns={'lines_x':'Number of lines in source code'},inplace=True) 703 | database.rename(columns={'lines_y':'Number of lines in metadata'},inplace=True) 704 | database.rename(columns={'IP_x':'Number of IP adress in source code'},inplace=True) 705 | database.rename(columns={'base64_x':'Number of base64 chunks in source code'},inplace=True) 706 | database.rename(columns={'sospicious token_x':'Number of sospicious token in source code'},inplace=True) 707 | 
database.rename(columns={'IP_y':'Number of IP adress in metadata'},inplace=True) 708 | database.rename(columns={'base64_y':'Number of base64 chunks in metadata'},inplace=True) 709 | database.rename(columns={'sospicious token_y':'Number of sospicious token in metadata'},inplace=True) 710 | return (database) -------------------------------------------------------------------------------- /scripts/feature_extraction/pypi-test.py: -------------------------------------------------------------------------------- 1 | from pypi_feature_extractor import PyPI_Feature_Extractor 2 | 3 | pypi_fe=PyPI_Feature_Extractor() 4 | input_data=pypi_fe.extract_features("py_samples") 5 | -------------------------------------------------------------------------------- /scripts/feature_extraction/pypi_feature_extractor.py: -------------------------------------------------------------------------------- 1 | from typing import final 2 | import nltk 3 | import json, os, tarfile 4 | import utilities_functions 5 | import pandas as pd 6 | import numpy as np 7 | import statistics 8 | import stat 9 | 10 | from pygments.lexers import PythonLexer 11 | from pygments.token import Token 12 | from pathlib import Path 13 | 14 | from functools import reduce 15 | from urlextract import URLExtract 16 | 17 | class PyPI_Feature_Extractor: 18 | 19 | 20 | def __init__(self) : 21 | 22 | 23 | # extensions 24 | # classes: source code, compiled code, packages code, image, video, audio, archive, font, apps, document, data, web, security, database. 25 | self.classes = ['bat', 'bz2', 'c', 'cert','conf','cpp' ,'crt', 'css', 'csv', 'deb' ,'erb','gemspec', 'gif', 'gz', 'h', 'html', 'ico' ,'ini' ,'jar', 'java', 'jpg', 'js', 'json', 'key' ,'m4v' ,'markdown' ,'md' ,'pdf', 'pem', 'png', 'ps', 'py', 'rb', 'rpm', 'rst','sh' ,'svg', 'toml', 'ttf', 'txt','xml', 'yaml', 'yml', 'eot', 'exe', 'jpeg', 'properties', 'sql', 'swf', 'tar', 'woff', 'woff2', 'aac','bmp', 'cfg' ,'dcm', 'dll', 'doc', 'flac','flv', 'ipynb', 'm4a', 'mid', 'mkv', 'mp3', 'mp4', 'mpg', 'ogg','otf', 'pickle', 'pkl' ,'psd', 'pxd' ,'pxi', 'pyc', 'pyx', 'r', 'rtf', 'so', 'sqlite' ,'tif', 'tp', 'wav', 'webp' ,'whl', 'xcf', 'xz', 'zip' ,'mov' ,'wasm', 'webm'] 26 | # stopwords 27 | nltk.download('stopwords') 28 | self.stopwords = set(nltk.corpus.stopwords.words('english')) 29 | 30 | # dangerous token 31 | with open('resources/dangerous_tokens.json', 'r') as file: 32 | self.dangerous_token = json.load(file) 33 | 34 | 35 | def extract_features(self, path: str) -> pd.DataFrame: 36 | ''' 37 | Executes the whole pipeline for the extraction of 38 | the features from the packages contained in the provided path 39 | 40 | Input: Path to the set of samples to be classified 41 | Output: Dataframe containing extracted data for each package 42 | ''' 43 | 44 | self.path_to_scan = path 45 | self.unzip_packages() 46 | py_files_df = self.extract_features_from_py()[0] 47 | 48 | setup_files_df = self.extract_features_from_py()[1] 49 | extensions_files_df = self.count_package_files_extension() 50 | 51 | dfs = [py_files_df, setup_files_df,extensions_files_df] 52 | final_df = reduce(lambda left,right: pd.merge(left,right,on=['Package Name'], 53 | how='outer'), dfs) 54 | final_df = self.extraction(final_df, utilities_functions.gen_language_4,4,utilities_functions.gen_language_4,4) 55 | final_df.to_csv("pypi_feature_extracted.csv", encoding='utf-8', index=False) 56 | return final_df 57 | 58 | 59 | def unzip_packages(self) -> None: 60 | ''' 61 | Unzips the .tar.gz file of each pyPI package 62 | ''' 63 | 64 | for root, 
dirs, files in os.walk(self.path_to_scan): 65 | for file in files: 66 | if file.endswith(".tar.gz"): 67 | if os.path.getsize(os.path.join(self.path_to_scan,file)) > 0: 68 | 69 | output_dir="".join((self.path_to_scan,"/",file.split(".tar.gz")[0])) 70 | print(f"[*] Processing {file}") 71 | pkg_file = tarfile.open(os.path.join(self.path_to_scan,file)) 72 | pkg_file.extractall(output_dir) 73 | #os.chmod(output_dir, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH) 74 | 75 | pkg_file.close() 76 | 77 | def extract_features_from_py(self) -> pd.DataFrame: 78 | 79 | ''' 80 | Extract the features from the list of paths containing JS files 81 | 82 | Input: list of path and string for the specific extension, .js extension, stopwords to be removed. 83 | Output: pandas dataframe 84 | 85 | ''' 86 | 87 | files_path = utilities_functions.find_files_of_ext(self.path_to_scan, ".py") 88 | #initialize the lists 89 | Package=list() 90 | version=list() 91 | jsfile=list() 92 | strings=list() 93 | #strings_entire=list() 94 | identifiers=list() 95 | sospicious_token=list() 96 | lines=list() 97 | plus_ratio=list() 98 | equal_ratio=list() 99 | square_ratio=list() 100 | Base64=list() 101 | ip=list() 102 | code=list() 103 | #initialize pandas 104 | db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 105 | for i in range(len(files_path)): 106 | # initialize the list for the puntuactions and operators token 107 | operator=[] 108 | punctuation=[] 109 | other=[] 110 | id=[] 111 | strs=[] 112 | p=Path(files_path[i]) 113 | # package name ###### change the path here adapt to the new repo 114 | 115 | package_name=p.parts[2] 116 | # name of the file 117 | js=p.parts[-1] 118 | file = open(files_path[i],"r",encoding="utf8",errors='ignore',newline='\n') 119 | # convert to string 120 | data=file.read() 121 | # apply the lexer specific for language 122 | lexer=PythonLexer(stripnl=False,ensurenl=False) 123 | token_source = lexer.get_tokens(data) 124 | for token in token_source: 125 | if token[0] in Token.Operator: 126 | operator.append(token[1]) 127 | elif token[0] in Token.Punctuation: 128 | punctuation.append(token[1]) 129 | elif token[0] in Token.Name: 130 | id.append(token[1]) 131 | elif (token[0] in Token.Literal.String.Single or token[0] in Token.Literal.String.Double or token[0] in Token.Literal.String.Affix or token[0] in Token.Literal.String.Backtick or token[0] in Token.Literal.String.Char or token[0] in Token.Literal.String.Delimiter or token[0] in Token.Literal.String.Doc or token[0] in Token.Literal.String.Escape or token[0] in Token.Literal.String.Heredoc or token[0] in Token.Literal.String.Interpol or token[0] in Token.Literal.String.Other): 132 | strs.append(token[1]) 133 | else: 134 | other.append(token[1]) 135 | with open(files_path[i],"r",encoding="utf8",errors='ignore') as fp: 136 | num_lines = sum(1 for line in fp) 137 | size = fp.seek(0, os.SEEK_END) 138 | size+=1 139 | id = [s.replace("'", '') for s in id] 140 | id = [s.replace('"', '') for s in id] 141 | id_=' '.join(id) 142 | equalities=operator.count('=')/size 143 | plus=operator.count('+')/size 144 | Lbrackets=punctuation.count('[')/size 145 | count_base64=0 146 | count_IP=0 147 | byte=0 148 | for value in range(0,len(strs)): 149 | count_base64+=len(utilities_functions.contains_base64(strs[value])) 150 | count_IP+=len(utilities_functions.contains_IPAddress(strs[value])) 151 | # contains_dangerous_token --> sospicious list 152 | byte+=len(utilities_functions.contains_dangerous_token(strs[value],self.dangerous_token)) 153 | strs = 
[s.replace("'", '') for s in strs] 154 | strs = [s.replace('"', '') for s in strs] 155 | #strings_entire.append(strs) 156 | string=' '.join(strs).split() 157 | #remove stopwords 158 | string=list(set(strs)-self.stopwords) 159 | string_=' '.join(string) 160 | file.close() 161 | fp.close() 162 | #append result to list 163 | code.append(data) 164 | Package.append(package_name) 165 | jsfile.append(js) 166 | sospicious_token.append(byte) 167 | lines.append(num_lines) 168 | plus_ratio.append(plus) 169 | equal_ratio.append(equalities) 170 | square_ratio.append(Lbrackets) 171 | identifiers.append(id_) 172 | Base64.append(count_base64) 173 | ip.append(count_IP) 174 | strings.append(string_) 175 | # assign to pandas dataframe 176 | 177 | db['Package Name']=Package 178 | db['.py']=jsfile 179 | db['sospicious token']=sospicious_token 180 | db['lines']=lines 181 | db['equal ratio']=equal_ratio 182 | db['plus ratio']=plus_ratio 183 | db['bracket ratio']=square_ratio 184 | db['identifiers']=identifiers 185 | db['base64']=Base64 186 | db['IP']=ip 187 | db['strings']=strings 188 | #db['strings entire']=strings_entire 189 | db['code']=code 190 | # returns two dataframe one for all .py files and one only for setup.py file 191 | setup_db=db[db['.py']=='setup.py'] 192 | db.drop(db.index[db['.py']=='setup.py'],inplace=True) 193 | return (self.merge_py_of_same_package(db),self.merge_setup_of_same_package(setup_db)) 194 | 195 | 196 | def merge_py_of_same_package(self, database: pd.DataFrame) -> pd.DataFrame: 197 | p_database= database.groupby(['Package Name'], as_index=False)['code'].agg('\n'.join) 198 | p_database['Number of words'] = p_database["code"].apply(lambda n: len(n.split())) 199 | l_database = database.groupby(['Package Name'], as_index=False)['lines'].sum() 200 | plus_mean= database.groupby(['Package Name'], as_index=False)['plus ratio'].mean() 201 | plus_mean = plus_mean.rename(columns={"plus ratio": "plus ratio mean"}) 202 | plus_max= database.groupby(['Package Name'], as_index=False)['plus ratio'].max() 203 | plus_max = plus_max.rename(columns={"plus ratio": "plus ratio max"}) 204 | plus_std= database.groupby(['Package Name'], as_index=False)['plus ratio'].std() 205 | plus_std = plus_std.rename(columns={"plus ratio": "plus ratio std"}) 206 | plus_q3= database.groupby(['Package Name'], as_index=False)['plus ratio'].quantile(0.75) 207 | plus_q3 = plus_q3.rename(columns={"plus ratio": "plus ratio q3"}) 208 | eq_mean= database.groupby(['Package Name'], as_index=False)['equal ratio'].mean() 209 | eq_mean = eq_mean.rename(columns={"equal ratio": "equal ratio mean"}) 210 | eq_max= database.groupby(['Package Name'], as_index=False)['equal ratio'].max() 211 | eq_max = eq_max.rename(columns={"equal ratio": "equal ratio max"}) 212 | eq_std= database.groupby(['Package Name'], as_index=False)['equal ratio'].std() 213 | eq_std = eq_std.rename(columns={"equal ratio": "equal ratio std"}) 214 | eq_q3= database.groupby(['Package Name'], as_index=False)['equal ratio'].quantile(0.75) 215 | eq_q3 = eq_q3.rename(columns={"equal ratio": "equal ratio q3"}) 216 | bracket_mean= database.groupby(['Package Name'], as_index=False)['bracket ratio'].mean() 217 | bracket_mean = bracket_mean.rename(columns={"bracket ratio": "bracket ratio mean"}) 218 | bracket_max= database.groupby(['Package Name'], as_index=False)['bracket ratio'].max() 219 | bracket_max = bracket_max.rename(columns={"bracket ratio": "bracket ratio max"}) 220 | bracket_std= database.groupby(['Package Name'], as_index=False)['bracket ratio'].std() 221 | 
bracket_std = bracket_std.rename(columns={"bracket ratio": "bracket ratio std"}) 222 | bracket_q3= database.groupby(['Package Name'], as_index=False)['bracket ratio'].quantile(0.75) 223 | bracket_q3 = bracket_q3.rename(columns={"bracket ratio": "bracket ratio q3"}) 224 | base = database.groupby(['Package Name'], as_index=False)['base64'].sum() 225 | ip = database.groupby(['Package Name'], as_index=False)['IP'].sum() 226 | sospicious = database.groupby(['Package Name'], as_index=False)['sospicious token'].sum() 227 | string = database.groupby(['Package Name'], as_index=False)['strings'].agg(' '.join) 228 | #string_entire = database.groupby(['Package Name'], as_index=False)['strings entire'].agg(lambda x: list(flatten(x))) 229 | identifier = database.groupby(['Package Name'], as_index=False)['identifiers'].agg(' '.join) 230 | #p_database['Number of files']=database.groupby(['Package Name', 'version'], as_index=False)['Package Name'].count()['Package Name'] 231 | # merge p_database and l_dataabse 232 | data = [p_database,l_database,plus_mean,plus_max,plus_std,plus_q3,eq_mean,eq_max,eq_std,eq_q3,bracket_mean,bracket_max,bracket_std,bracket_q3,base,ip,sospicious,string,identifier] 233 | #merge all DataFrames into one 234 | final_database = reduce(lambda left,right: pd.merge(left,right,on=['Package Name'], how='outer'), data) 235 | final_database.drop('code',axis=1,inplace=True) 236 | final_database.columns=['Package Name','Number of words','lines','plus ratio mean','plus ratio max','plus ratio std','plus ratio q3','eq ratio mean','eq ratio max','eq ratio std','eq ratio q3','bracket ratio mean','bracket ratio max','bracket ratio std','bracket ratio q3','base64','IP','sospicious token','strings','identifiers'] 237 | return (final_database) 238 | 239 | def merge_setup_of_same_package(self,database): 240 | p_database= database.groupby(['Package Name'], as_index=False)['code'].agg('\n'.join) 241 | p_database['Number of words'] = p_database["code"].apply(lambda n: len(n.split())) 242 | l_database = database.groupby(['Package Name'], as_index=False)['lines'].sum() 243 | base = database.groupby(['Package Name'], as_index=False)['base64'].sum() 244 | ip = database.groupby(['Package Name'], as_index=False)['IP'].sum() 245 | sospicious = database.groupby(['Package Name'], as_index=False)['sospicious token'].sum() 246 | string = database.groupby(['Package Name'], as_index=False)['strings'].agg(' '.join) 247 | #string_entire = database.groupby(['Package Name'], as_index=False)['strings entire'].agg(lambda x: list(flatten(x))) 248 | identifier = database.groupby(['Package Name'], as_index=False)['identifiers'].agg(' '.join) 249 | #p_database['Number of files']=database.groupby(['Package Name', 'version'], as_index=False)['Package Name'].count()['Package Name'] 250 | # merge p_database and l_dataabse 251 | data = [p_database,l_database,base,ip,sospicious,string,identifier] 252 | #merge all DataFrames into one 253 | final_database = reduce(lambda left,right: pd.merge(left,right,on=['Package Name'], how='outer'), data) 254 | final_database.drop('code',axis=1,inplace=True) 255 | final_database.columns=['Package Name','Number of words','lines','base64','IP','sospicious token','strings','identifiers'] 256 | return (final_database) 257 | 258 | #### 259 | # classes: list of extension we are looking for 260 | def count_package_files_extension(self) -> pd.DataFrame: 261 | ''' 262 | function for extraction number of files with a given extension inside a given package 263 | root: folder that contains the malicious 
packages 264 | classes: list of extension we are looking for 265 | function to add a point before the list of extensions 266 | ''' 267 | #initialize the lists 268 | Package=list() 269 | extension=list() 270 | #initialize pandas 271 | db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 272 | # for each extension 273 | for i in range(0,len(self.classes)): 274 | #extract the extension we are interested in: 275 | ext='.'+self.classes[i] 276 | files_path=utilities_functions.find_files_of_ext(self.path_to_scan,ext) 277 | # for each file path 278 | for j in range(len(files_path)): 279 | # extract the path 280 | p=Path(files_path[j]) 281 | # package name ##### change the path here 282 | if "tar.gz" not in p.parts[-1]: 283 | package_name=p.parts[2] 284 | 285 | # version name 286 | Package.append(package_name) 287 | extension.append(ext) 288 | 289 | db['Package Name']=Package 290 | db['extension']=extension 291 | # count frequency of extension, grouped by package name and version 292 | db=db.groupby(['Package Name', 'extension']).size().unstack(fill_value=0) 293 | # for each package keep only the last version 294 | db=db.groupby('Package Name').last() 295 | 296 | def add_to_beginning(s, start='.'): 297 | return start + s 298 | extensions = list(map(add_to_beginning, self.classes)) 299 | #select extensions not founded in the initial list 300 | f = [c for c in extensions if c not in db.columns] 301 | #add them to the dataframe 302 | db = pd.concat([db,pd.DataFrame(columns = f)]) 303 | # fill Nan with 0 304 | db[f] = db[f].fillna(0) 305 | # order the column 306 | db=db[extensions] 307 | db.reset_index(inplace=True) 308 | db =db.rename(columns = {'index':'Package Name'}) 309 | return (db) 310 | 311 | def extraction(self, database,alphabetic_string,base_string,alphabetic_id,base_id): 312 | extractor = URLExtract() 313 | # repository for Pypi 314 | database['repository'] = pd.Series([2 for x in range(len(database.index))]) 315 | f = [c for c in database.columns if c not in ['strings_x','identifiers_x','strings_y','identifiers_y']] 316 | database[f] = database[f].fillna(0) 317 | # reset index 318 | database.index=range(0,len(database)) 319 | #extractor.update() For updating TLDs list 320 | # define code to inspect and name of the package 321 | source_code_strings=database['strings_x'] 322 | source_code_identifiers=database['identifiers_x'] 323 | metadata_strings=database['strings_y'] 324 | metadata_identifiers=database['identifiers_y'] 325 | name=database['Package Name'] 326 | repository=database['repository'] 327 | check_metadata_strings=metadata_strings.isna() 328 | check_metadata_identifiers=metadata_identifiers.isna() 329 | check_source_code_strings=source_code_strings.isna() 330 | check_source_code_identifiers=source_code_identifiers.isna() 331 | #initilize lists: one value for each package. 
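# Overview of the per-package loop below: strings and identifiers from the source
# code and from the metadata are tokenized, each token is mapped through the
# generalization language passed in as alphabetic_string / alphabetic_id, and
# utilities_functions.shannon_entropy is applied to every generalized token.
# The per-token entropies are then summarized as mean / std / max / third
# quartile, zero-entropy ("homogeneous") tokens are counted, URLs are counted
# with URLExtract (falling back to utilities_functions.contains_URL), and
# utilities_functions.obfuscation counts generalized tokens that mix digits and
# symbols ("heterogeneous"). Metadata identifiers are additionally checked for
# the repository-specific installation-script entry points selected at the top
# of the loop.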
332 | # source code shannon's features 333 | q3_id_sc=[] 334 | q3_str_sc=[] 335 | m_id_sc=[] 336 | m_str_sc=[] 337 | dev_id_sc=[] 338 | dev_str_sc=[] 339 | maximum_id_sc=[] 340 | maximum_str_sc=[] 341 | flat_id_sc=[] 342 | flat_string_sc=[] 343 | count_url_sc=[] 344 | obf_id_sc=[] 345 | obf_string_sc=[] 346 | # metadata shannon's features 347 | q3_id_md=[] 348 | q3_str_md=[] 349 | m_id_md=[] 350 | m_str_md=[] 351 | dev_id_md=[] 352 | dev_str_md=[] 353 | maximum_id_md=[] 354 | maximum_str_md=[] 355 | flat_id_md=[] 356 | flat_string_md=[] 357 | count_url_md=[] 358 | obf_id_md=[] 359 | obf_string_md=[] 360 | # installation script feature in metadata 361 | installation=[] 362 | #db=pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False) 363 | for i in range(len(database)): 364 | 365 | # select the entry points specific for each language 366 | if repository[i]==3: 367 | install=['extensions'] 368 | elif repository[i]==2: 369 | install=['install'] 370 | else: 371 | install=['postinstall','preinstall','install'] 372 | # source code 373 | if check_source_code_strings[i]==False: 374 | # string 375 | string_sourcecode=source_code_strings[i] 376 | # create a list of strings from a unique string 377 | string=string_sourcecode.split() 378 | else: 379 | string=[] 380 | if check_source_code_identifiers[i]==False: 381 | # identifiers 382 | identifiers_sourcecode=source_code_identifiers[i] 383 | # create a list of identifiers from a unique string 384 | identifiers=identifiers_sourcecode.split() 385 | else: 386 | identifiers=[] 387 | # apply the generalization language 388 | generalization_str=[] 389 | generalization_id=[] 390 | # identifiers 391 | for h in range(0,len(identifiers)): 392 | gen=alphabetic_id(identifiers[h]) 393 | generalization_id.append(gen) 394 | obf_sc=utilities_functions.obfuscation(generalization_id,symbols=['u','d','l','s']) 395 | # strings 396 | url_sc=0 397 | for k in range(0,len(string)): 398 | try: 399 | url_sc+=len(extractor.find_urls(string[k])) 400 | except: 401 | url_sc += len(utilities_functions.contains_URL(string[k])) 402 | gen=alphabetic_string(string[k]) 403 | generalization_str.append(gen) 404 | obf_sc_str=utilities_functions.obfuscation(generalization_str,symbols=['u','d','l','s']) 405 | # apply shannon entropy 406 | shannon_str=[] 407 | shannon_id=[] 408 | # identifiers 409 | for w in range(0,len(generalization_id)): 410 | shan=utilities_functions.shannon_entropy(generalization_id[w],base_id) 411 | shannon_id.append(shan) 412 | # strings 413 | for y in range(0,len(generalization_str)): 414 | shan=utilities_functions.shannon_entropy(generalization_str[y],base_string) 415 | shannon_str.append(shan) 416 | # remove shannon values which are equal to 0 417 | #shannon_str_no0 = list(filter(lambda x: abs(x) != 0,shannon_str)) 418 | #shannon_id_no0 = list(filter(lambda x: abs(x) != 0, shannon_id)) 419 | null_string_sc=len(list(filter(lambda x: abs(x) == 0,shannon_str))) 420 | null_id_sc=len(list(filter(lambda x: abs(x) == 0, shannon_id))) 421 | #shannon_str=shannon_str_no0 422 | #shannon_id=shannon_id_no0 423 | if len(shannon_str)>=1: 424 | mean_str=statistics.mean(shannon_str) 425 | max_str=max(shannon_str) 426 | quart_str=np.quantile(shannon_str,0.75) 427 | else: 428 | mean_str=0 429 | max_str=0 430 | quart_str=0 431 | if len(shannon_str)>1: 432 | std_str=np.std(shannon_str) 433 | else: 434 | std_str=0 435 | if len(shannon_id)>=1: 436 | mean_id=statistics.mean(shannon_id) 437 | max_id=max(shannon_id) 438 | quart_id=np.quantile(shannon_id, 0.75) 439 | else: 
440 | mean_id=0 441 | max_id=0 442 | quart_id=0 443 | if len(shannon_id)>1: 444 | std_id=np.std(shannon_id) 445 | else: 446 | std_id=0 447 | m_str_sc.append(mean_str) 448 | dev_str_sc.append(std_str) 449 | maximum_str_sc.append(max_str) 450 | q3_str_sc.append(quart_str) 451 | m_id_sc.append(mean_id) 452 | dev_id_sc.append(std_id) 453 | maximum_id_sc.append(max_id) 454 | q3_id_sc.append(quart_id) 455 | flat_id_sc.append(null_id_sc) 456 | flat_string_sc.append(null_string_sc) 457 | count_url_sc.append(url_sc) 458 | obf_id_sc.append(obf_sc) 459 | obf_string_sc.append(obf_sc_str) 460 | #metadata analysis 461 | # string 462 | if check_metadata_strings[i]==False: 463 | string_metadata=metadata_strings[i] 464 | # create a list of strings from a unique string 465 | string_md=string_metadata.split() 466 | else: 467 | string_md=[] 468 | # identifiers 469 | if check_metadata_identifiers[i]==False: 470 | identifiers_metadata=metadata_identifiers[i] 471 | # create a list of identifiers from a unique string 472 | identifiers_md=identifiers_metadata.split() 473 | if any(f in identifiers_md for f in install)==True: 474 | install_script=1 475 | else: 476 | install_script=0 477 | else: 478 | identifiers_md=[] 479 | install_script=0 480 | # apply the generalization language 481 | generalization_str_md=[] 482 | generalization_id_md=[] 483 | # identifiers 484 | for h in range(0,len(identifiers_md)): 485 | gen=alphabetic_id(identifiers_md[h]) 486 | generalization_id_md.append(gen) 487 | obf_md=utilities_functions.obfuscation(generalization_id_md,symbols=['u','d','l','s']) 488 | # strings 489 | url_md=0 490 | for k in range(0,len(string_md)): 491 | try: 492 | url_sc+=len(extractor.find_urls(string_md[k])) 493 | except: 494 | url_sc += len(utilities_functions.contains_URL(string_md[k])) 495 | gen=alphabetic_string(string_md[k]) 496 | generalization_str_md.append(gen) 497 | obf_md_str=utilities_functions.obfuscation(generalization_str_md,symbols=['u','d','l','s']) 498 | # apply shannon entropy 499 | shannon_str_md=[] 500 | shannon_id_md=[] 501 | # identifiers 502 | for w in range(0,len(generalization_id_md)): 503 | shan=utilities_functions.shannon_entropy(generalization_id_md[w],base_id) 504 | shannon_id_md.append(shan) 505 | # strings 506 | for y in range(0,len(generalization_str_md)): 507 | shan=utilities_functions.shannon_entropy(generalization_str_md[y],base_string) 508 | shannon_str_md.append(shan) 509 | # remove shannon values which are equal to 0 510 | #shannon_str_md_no0 = list(filter(lambda x: abs(x) != 0,shannon_str_md)) 511 | #shannon_id_md_no0 = list(filter(lambda x: abs(x) != 0, shannon_id_md)) 512 | null_id_md=len(list(filter(lambda x: abs(x) == 0, shannon_id_md))) 513 | null_string_md=len(list(filter(lambda x: abs(x) == 0,shannon_str_md))) 514 | #shannon_str_md=shannon_str_md_no0 515 | #shannon_id_md=shannon_id_md_no0 516 | if len(shannon_str_md)>=1: 517 | mean_str_md=statistics.mean(shannon_str_md) 518 | max_str_md=max(shannon_str_md) 519 | quart_str_md=np.quantile(shannon_str_md,0.75) 520 | else: 521 | mean_str_md=0 522 | max_str_md=0 523 | quart_str_md=0 524 | if len(shannon_str_md)>1: 525 | std_str_md=np.std(shannon_str_md) 526 | else: 527 | std_str_md=0 528 | if len(shannon_id_md)>=1: 529 | mean_id_md=statistics.mean(shannon_id_md) 530 | max_id_md=max(shannon_id_md) 531 | quart_id_md=np.quantile(shannon_id_md, 0.75) 532 | else: 533 | mean_id_md=0 534 | max_id_md=0 535 | quart_id_md=0 536 | if len(shannon_id_md)>1: 537 | std_id_md=np.std(shannon_id_md) 538 | else: 539 | std_id_md=0 540 | 
installation.append(install_script) 541 | m_str_md.append(mean_str_md) 542 | dev_str_md.append(std_str_md) 543 | maximum_str_md.append(max_str_md) 544 | q3_str_md.append(quart_str_md) 545 | m_id_md.append(mean_id_md) 546 | dev_id_md.append(std_id_md) 547 | maximum_id_md.append(max_id_md) 548 | q3_id_md.append(quart_id_md) 549 | flat_id_md.append(null_id_md) 550 | flat_string_md.append(null_string_md) 551 | count_url_md.append(url_md) 552 | obf_id_md.append(obf_md) 553 | obf_string_md.append(obf_md_str) 554 | # assign columns to the existing dataframe 555 | pd.options.mode.chained_assignment = None 556 | import warnings 557 | warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning) 558 | # note that this type of assignment is not efficient 559 | database['presence of installation script']=installation 560 | database['shannon mean ID source code']=m_id_sc 561 | database['shannon std ID source code']=dev_id_sc 562 | database['shannon max ID source code']=maximum_id_sc 563 | database['shannon q3 ID source code']=q3_id_sc 564 | database['shannon mean string source code']=m_str_sc 565 | database['shannon std string source code']=dev_str_sc 566 | database['shannon max string source code']=maximum_str_sc 567 | database['shannon q3 string source code']=q3_str_sc 568 | database['homogeneous identifiers in source code']=flat_id_sc 569 | database['homogeneous strings in source code']=flat_string_sc 570 | database['heteregeneous identifiers in source code']=obf_id_sc 571 | database['heterogeneous strings in source code']=obf_string_sc 572 | database['URLs in source code']=count_url_sc 573 | # metadata features 574 | database['shannon mean ID metadata']=m_id_md 575 | database['shannon std ID metadata']=dev_id_md 576 | database['shannon max ID metadata']=maximum_id_md 577 | database['shannon q3 ID metadata']=q3_id_md 578 | database['shannon mean string metadata']=m_str_md 579 | database['shannon std string metadata']=dev_str_md 580 | database['shannon max string metadata']=maximum_str_md 581 | database['shannon q3 string metadata']=q3_str_md 582 | database['homogeneous identifiers in metadata']=flat_id_md 583 | database['homogeneous strings in metadata']=flat_string_md 584 | database['heterogeneous strings in metadata']=obf_string_md 585 | database['URLs in metadata']=count_url_md 586 | database['heteregeneous identifiers in metadata']=obf_id_md 587 | # drop code_x and code_y: raw source code and metadata 588 | database.drop(['strings_x', 'strings_y','identifiers_x','identifiers_y'], axis=1, inplace=True) 589 | # remove duplicates based on some numeric features 590 | database.drop_duplicates(subset=['Number of words_x','Number of words_y','lines_x','lines_y','repository','presence of installation script'],keep='first',inplace=True) 591 | # change the column name of Number of Words_x, Number of Words_y in Number of Words in source code, Number of Words in metadata 592 | database.rename(columns={'Number of words_x':'Number of Words in source code'},inplace=True) 593 | database.rename(columns={'Number of words_y':'Number of Words in metadata'},inplace=True) 594 | database.rename(columns={'lines_x':'Number of lines in source code'},inplace=True) 595 | database.rename(columns={'lines_y':'Number of lines in metadata'},inplace=True) 596 | database.rename(columns={'IP_x':'Number of IP adress in source code'},inplace=True) 597 | database.rename(columns={'base64_x':'Number of base64 chunks in source code'},inplace=True) 598 | database.rename(columns={'sospicious token_x':'Number of sospicious 
token in source code'},inplace=True) 599 | database.rename(columns={'IP_y':'Number of IP adress in metadata'},inplace=True) 600 | database.rename(columns={'base64_y':'Number of base64 chunks in metadata'},inplace=True) 601 | database.rename(columns={'sospicious token_y':'Number of sospicious token in metadata'},inplace=True) 602 | return (database) 603 | 604 | -------------------------------------------------------------------------------- /scripts/feature_extraction/requirements.txt: -------------------------------------------------------------------------------- 1 | nltk 2 | pandas 3 | pygments 4 | urlextract -------------------------------------------------------------------------------- /scripts/feature_extraction/utilities_functions.py: -------------------------------------------------------------------------------- 1 | import base64, os, re, socket, json 2 | from pathlib import Path 3 | from collections import Counter 4 | import math 5 | 6 | b64regex = re.compile(r'[a-zA-Z0-9=/\+]*') 7 | ipaddr_regex = re.compile(r'[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}') 8 | urls_regex = re.compile(r"""((?:(?:http|http|ssh|ftp|sftp|ws|wss|dns|file|git|jni|imap|ldap|ldaps|nfs|smb|smbs|telnet|udp|vnc)?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|org|uk)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(? 1: 45 | if is_base64(w): 46 | base64_strings.append(w) 47 | return base64_strings 48 | 49 | 50 | def contains_IPAddress(string): 51 | list_of_words = list(dict.fromkeys(ipaddr_regex.findall(string))) 52 | IPAddress_strings = [] 53 | for w in list_of_words: 54 | if (len(w) > 6): 55 | if is_IPAddress(w): 56 | IPAddress_strings.append(w) 57 | return IPAddress_strings 58 | 59 | def contains_URL(string): 60 | 61 | list_of_matches = urls_regex.findall(string) 62 | list_of_candidates = [] 63 | for m in list_of_matches: 64 | list_of_candidates.append(max(list(m),key=len)) 65 | return list_of_candidates 66 | 67 | def contains_dangerous_token(string,dangerous_tok): 68 | findings_list = [] 69 | for susp in dangerous_tok: 70 | if susp in string: 71 | findings_list.append(susp) 72 | return findings_list 73 | 74 | 75 | def find_files_of_ext(root, ext): 76 | # find the path for a given extension 77 | return [str(Path(dir, file_)) for dir, subdir, files in os.walk(root) for file_ in files if Path(file_).suffix == ext] 78 | 79 | # shannon entropy function 80 | def shannon_entropy(data, base=2): 81 | entropy = 0.0 82 | if len(data) > 0: 83 | cnt = Counter(data) 84 | length = len(data) 85 | for count in cnt.values(): 86 | entropy += (count / length) * math.log(count / length, base) 87 | entropy = entropy * -1.0 88 | return (entropy) 89 | 90 | # input list of identifiers transformed by the generalization language 91 | def obfuscation(list_id,symbols=['u','d','l','s']): 92 | 93 | unique_symbols_id=[] 94 | # get unique symbols from each identifiers 95 | for i in range(0,len(list_id)): 96 | unique_symbols_id.append("".join(set(list_id[i]))) 97 | # initialize the count for obfuscation: 98 | obs=0 99 | for i in range(0,len(unique_symbols_id)): 100 | # Upper case, digit, lower case, symbol 101 | if (check(unique_symbols_id[i],symbols))==['True', 'True', 'True', 'True']: 102 | obs+=1 103 | # upper case, digit, symbol 104 | if (check(unique_symbols_id[i],symbols))==['True', 'True', 'False', 'True']: 105 | obs+=1 106 | # digit, lower case, symbol 107 | if (check(unique_symbols_id[i],symbols))==['False', 'True', 'True', 
'True']: 108 | obs+=1 109 | # digit, symbol 110 | if (check(unique_symbols_id[i],symbols))==['False', 'True', 'False', 'True']: 111 | obs+=1 112 | 113 | 114 | return(obs) 115 | 116 | # function to check the presence of given symbols in identifiers: symbols of the generalization language with 4 characters 117 | 118 | def check(s, arr): 119 | result = [] 120 | for i in arr: 121 | 122 | # for every character in char array 123 | # if it is present in string return true else false 124 | if i in s: 125 | result.append("True") 126 | else: 127 | result.append("False") 128 | return result 129 | 130 | def gen_language_4(value): 131 | pattern = '' 132 | value = list(str(value)) 133 | for c in value: 134 | if c.isnumeric(): 135 | pattern += 'd' 136 | elif c.isupper(): 137 | pattern += 'u' 138 | elif c.islower(): 139 | pattern +='l' 140 | else: 141 | pattern += 's' 142 | 143 | return pattern 144 | 145 | # generalization languages 146 | def gen_language_3(value): 147 | pattern = '' 148 | value = list(str(value)) 149 | for c in value: 150 | if c.isnumeric(): 151 | pattern += 'd' 152 | elif c.isalpha(): 153 | pattern += 'l' 154 | else: 155 | pattern += 's' 156 | 157 | return (pattern) 158 | 159 | 160 | def gen_language_8(value): 161 | pattern = '' 162 | value = list(str(value)) 163 | for c in value: 164 | if c.isnumeric(): 165 | pattern += 'd' 166 | elif c.isupper(): 167 | pattern += 'u' 168 | elif c.islower(): 169 | pattern +='l' 170 | elif c=='.': 171 | pattern +='p' 172 | elif c=='/': 173 | pattern +='h' 174 | elif c=='-': 175 | pattern +='a' 176 | elif c=='|' or c=='%' or c=='$'or c=='~'or c=='?': 177 | pattern +='i' 178 | else: 179 | pattern += 's' 180 | 181 | return (pattern) 182 | 183 | def gen_language_16(value): 184 | pattern = '' 185 | value = list(str(value)) 186 | for c in value: 187 | if c.isnumeric(): 188 | pattern += 'd' 189 | elif c.isupper(): 190 | pattern += 'u' 191 | elif c.islower(): 192 | pattern +='l' 193 | elif c=='.': 194 | pattern +='p' 195 | elif c=='/': 196 | pattern +='h' 197 | elif c=='-': 198 | pattern +='a' 199 | elif c=='%': 200 | pattern +='p' 201 | elif c=='|': 202 | pattern +='i' 203 | elif c=='=': 204 | pattern +='e' 205 | elif c==':': 206 | pattern +='c' 207 | elif c=='$': 208 | pattern +='m' 209 | elif c=='>': 210 | pattern +='g' 211 | elif c=='<': 212 | pattern +='o' 213 | elif c=='~': 214 | pattern +='t' 215 | elif c=='?': 216 | pattern +='q' 217 | else: 218 | pattern += 's' 219 | 220 | return (pattern) 221 | 222 | -------------------------------------------------------------------------------- /scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.2.3 2 | numpy==1.22.0 3 | xgboost==1.6.1 4 | scikit-learn==1.2.2 5 | bayesian-optimization==1.4.3 6 | joblib==1.2.0 -------------------------------------------------------------------------------- /scripts/utilities_functions.py: -------------------------------------------------------------------------------- 1 | # scipy version 1.7.0 required 2 | 3 | import pandas as pd 4 | 5 | import numpy as np 6 | import xgboost as xgb 7 | from sklearn.model_selection import train_test_split, cross_val_score, cross_validate 8 | from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score 9 | from sklearn.metrics import confusion_matrix 10 | from sklearn.tree import DecisionTreeClassifier 11 | from sklearn.ensemble import RandomForestClassifier 12 | from bayes_opt import BayesianOptimization 13 | 14 | 15 | 16 | 17 | def 
split_training_testing(database,test_size,random): 18 | f = [c for c in database.columns if c not in ['Malicious','Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','Package Repository','Package Name']]#,'Number of lines in metadata','Number of Words in metadata' 19 | 20 | #regressor 21 | X = database[f].iloc[:,:].values 22 | 23 | #target info 24 | y = database.loc[:,['Malicious','Package Repository','Package Name']].values 25 | 26 | 27 | #stratification based benign/malicious and public repository origin ratio 28 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, shuffle=True, stratify=y[:,0:2], random_state=random) # 0:2 stratify by ratio between malicious and benign and origin repo ratio 29 | 30 | return(X_train, X_test, y_train, y_test,f) 31 | 32 | 33 | ''' 34 | XGBOOST 35 | 36 | ''' 37 | 38 | def evaluation_NPM_Pypi_xgb(database): 39 | database = database.loc[:, ~database.columns.str.contains('^Unnamed')] 40 | database['Package Repository'] = np.where(database['Package Repository'] == "NPM", 1, 2) 41 | 42 | # Dict to set set of params and related precision 43 | hyperpar_list = [] 44 | # define a list for the evaluation metrics 45 | evaluation=['precision','recall','f1', 'accuracy','false positive','false negative','true negative','true positive','precision_npm','recall_npm','f1_npm', 'acc_npm','precision_pypi','recall_pypi','f1_pypi','acc_pypi'] 46 | # define a list for the features 47 | f = [c for c in database.columns if c not in ['Malicious','Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','Package Repository','Package Name']]#,'Number of lines in metadata','Number of Words in metadata' 48 | # initialize the dataframe 49 | eval = pd.DataFrame(data=None, index=[y for x in [f,evaluation] for y in x]) 50 | random_split=[123,333,567,999,876,371,459,111,902,724] 51 | for i in range(0,len(random_split)): 52 | #split 53 | split_=split_training_testing(database[database['Package Repository']!=3], test_size=0.2, random=random_split[i]) 54 | # optimization of the hyperparameters with cross-validation in the train set 55 | train_rf_=grid_xgb_py(split_[0], split_[2]) 56 | # fit the model with the best hyperparameters 57 | classifier =xgb.XGBClassifier(random_state=123,n_estimators=train_rf_['n_estimators'],max_depth=train_rf_['max_depth'],gamma=train_rf_['gamma'],eta=train_rf_['eta'],colsample_bytree=train_rf_['colsample_bytree'],min_child_weight=train_rf_['min_child_weight']) 58 | classifier.fit(split_[0], split_[2][:,0].astype('int')) 59 | #predict on test data 60 | y_pred_test_=classifier.predict(split_[1]) 61 | 62 | hyperpar_list.append({'precision':round(precision_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2),'hyperparams':train_rf_}) 63 | # array for features,precison,recall,f1 64 | precision=np.append(classifier.feature_importances_,round(precision_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 65 | recall=np.append(precision,round(recall_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 66 | f1=np.append(recall,round(f1_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 67 | acc=np.append(f1,round(accuracy_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 68 | # false positive, false negative, true negative, true positive 69 | tn,fp,fn,tp = confusion_matrix(split_[3][:,0].astype('int'),y_pred_test_).ravel() 70 | false_positive=np.append(acc,fp) 71 | false_negative=np.append(false_positive,fn) 72 | true_negative=np.append(false_negative,tn) 73 | true_positive=np.append(true_negative,tp) 74 | # evaluation group by 
repository 75 | repository=np.concatenate((split_[3][:,0].astype('int').reshape(len(split_[3][:,0].astype('int')),1), y_pred_test_.reshape(len(y_pred_test_),1),split_[3][:,1].astype('int').reshape(len(split_[3][:,1].astype('int')),1)), axis=1, out=None) 76 | npm=repository[repository[:,2] == 1] 77 | pypi=repository[repository[:,2] == 2] 78 | # precision, recall, f1 for npm 79 | precision_npm=np.append(true_positive,round(precision_score(npm[:,0], npm[:,1])*100,2)) 80 | recall_npm=np.append(precision_npm,round(recall_score(npm[:,0], npm[:,1])*100,2)) 81 | f1_npm=np.append(recall_npm,round(f1_score(npm[:,0], npm[:,1])*100,2)) 82 | acc_npm=np.append(f1_npm,round(accuracy_score(npm[:,0], npm[:,1])*100,2)) 83 | # precision, recall, f1 for pypi 84 | precision_pypi=np.append(acc_npm,round(precision_score(pypi[:,0], pypi[:,1])*100,2)) 85 | recall_pypi=np.append(precision_pypi,round(recall_score(pypi[:,0], pypi[:,1])*100,2)) 86 | f1_pypi=np.append(recall_pypi,round(f1_score(pypi[:,0], pypi[:,1])*100,2)) 87 | metrics=np.append(f1_pypi,round(accuracy_score(pypi[:,0], pypi[:,1])*100,2)) 88 | eval[i]=metrics.tolist() 89 | 90 | 91 | # replace 0, with NaN 92 | eval=eval.replace(0,np.nan) 93 | mean=eval.mean(axis=1) 94 | std=eval.std(axis=1) 95 | result=pd.concat([mean, std], axis=1) 96 | return(result, get_best_hyperparams(hyperpar_list)) 97 | 98 | 99 | 100 | def get_best_hyperparams(hyperparams_list): 101 | max_prec = hyperparams_list[0]['precision'] 102 | final_hyperparam_set = hyperparams_list[0]['hyperparams'] 103 | for e in hyperparams_list: 104 | if e['precision'] > max_prec: 105 | max_prec = e['precision'] 106 | final_hyperparam_set = e['hyperparams'] 107 | return final_hyperparam_set 108 | 109 | 110 | 111 | 112 | 113 | # bayes opt grid search XGboost 114 | def grid_xgb_py (regressors,labels): 115 | #function for the maximization of the target 116 | def xgb_cl_bo(max_depth,n_estimators,colsample_bytree,eta,gamma,min_child_weight): 117 | params_xgb={} 118 | params_xgb['max_depth'] = int(max_depth) 119 | params_xgb['n_estimators'] = int(n_estimators) 120 | params_xgb['colsample_bytree']=colsample_bytree 121 | params_xgb['min_child_weight'] = int(min_child_weight) 122 | params_xgb['eta']=eta 123 | params_xgb['gamma']=gamma 124 | classifier = xgb.XGBClassifier(random_state=123,n_estimators=params_xgb['n_estimators'],max_depth=params_xgb['max_depth'],gamma=params_xgb['gamma'],eta=params_xgb['eta'],colsample_bytree=params_xgb['colsample_bytree'],min_child_weight=params_xgb['min_child_weight']) 125 | #scores=cross_val_score(classifier,regressors,labels[:,0].astype('int'),cv=5,scoring='precision',n_jobs=-1) 126 | #target=scores.mean() 127 | scoring = {'rec': 'recall', 128 | 'prec': 'precision' } 129 | scores = cross_validate(classifier,regressors,labels[:,0].astype('int'), scoring=scoring, 130 | cv=5, return_train_score=True,n_jobs=-1) 131 | print('recall',round(scores['test_rec'].mean(),2)) 132 | print('precision train',round(scores['train_prec'].mean(),2)) 133 | target=scores['test_prec'].mean() 134 | return (target) 135 | params_xgb ={ 136 | 'max_depth':(2, 4), 137 | 'n_estimators':(64,256), 138 | 'min_child_weight':(8,16), 139 | 'gamma':(0.6,1.2), 140 | 'eta':(0.08,0.16), 141 | 'colsample_bytree':(0.1,0.3) 142 | } 143 | xgb_bo = BayesianOptimization(xgb_cl_bo, params_xgb, random_state=111,verbose=1) 144 | xgb_bo.maximize(init_points=25, n_iter=5) 145 | print(xgb_bo.max) 146 | params_xgb = xgb_bo.max['params'] 147 | params_xgb={} 148 | params_xgb['n_estimators']= 
int(xgb_bo.max["params"]["n_estimators"]) 149 | params_xgb["max_depth"] = int(xgb_bo.max["params"]["max_depth"]) 150 | params_xgb['min_child_weight']= int(xgb_bo.max["params"]["min_child_weight"]) 151 | params_xgb['eta']=xgb_bo.max['params']['eta'] 152 | params_xgb['gamma']=xgb_bo.max['params']['gamma'] 153 | params_xgb['colsample_bytree']=xgb_bo.max['params']['colsample_bytree'] 154 | #print(params_tree) 155 | return (params_xgb) 156 | 157 | 158 | 159 | ''' 160 | Decision Tree 161 | 162 | ''' 163 | 164 | 165 | def evaluation_decision_tree(database): 166 | 167 | database = database.loc[:, ~database.columns.str.contains('^Unnamed')] 168 | database['Package Repository'] = np.where(database['Package Repository'] == "NPM", 1, 2) 169 | # Dict to set set of params and related precision 170 | hyperpar_list = [] 171 | # define a list for the evaluation metrics 172 | evaluation=['precision','recall','f1', 'accuracy','false positive','false negative','true negative','true positive','precision_npm','recall_npm','f1_npm', 'acc_npm','precision_pypi','recall_pypi','f1_pypi','acc_pypi'] 173 | # define a list for the features 174 | #f = [c for c in database.columns if c not in ['Malicious','Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','Package Repository','Package Name']]#,'Number of lines in metadata','Number of Words in metadata' 175 | database = database.loc[:, ~database.columns.str.contains('^Unnamed')] 176 | 177 | f = [c for c in database.columns if c not in ['Malicious','Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','Unnamed: 0.1.1.1','Package Repository','Package Name']]#,'Number of lines in metadata','Number of Words in metadata' 178 | # initialize the dataframe 179 | eval = pd.DataFrame(data=None, index=[y for x in [f,evaluation] for y in x]) 180 | random_split=[123,333,567,999,876,371,459,111,902,724] 181 | for i in range(0,10): 182 | #split 183 | split_=split_training_testing(database[database['Package Repository']!=3], test_size=0.2, random=random_split[i]) 184 | # optimization of the hyperparameters with cross-validation in the train set 185 | train_rf_=grid_tree(split_[0], split_[2]) 186 | # fit the model with the best hyperparameters 187 | classifier = DecisionTreeClassifier(random_state=123,criterion=train_rf_['criterion'],max_depth=train_rf_['max_depth'],max_features=train_rf_['max_features'],min_samples_leaf=train_rf_['min_sample_leaf'],min_samples_split=train_rf_['min_sample_split']) 188 | classifier.fit(split_[0], split_[2][:,0].astype('int')) 189 | #predict on test data 190 | y_pred_test_=classifier.predict(split_[1]) 191 | hyperpar_list.append({'precision':round(precision_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2),'hyperparams':train_rf_}) 192 | # array for features,precison,recall,f1 193 | precision=np.append(classifier.feature_importances_,round(precision_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 194 | recall=np.append(precision,round(recall_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 195 | f1=np.append(recall,round(f1_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 196 | acc=np.append(f1,round(accuracy_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 197 | # false positive, false negative, true negative, true positive 198 | tn,fp,fn,tp = confusion_matrix(split_[3][:,0].astype('int'),y_pred_test_).ravel() 199 | false_positive=np.append(acc,fp) 200 | false_negative=np.append(false_positive,fn) 201 | true_negative=np.append(false_negative,tn) 202 | true_positive=np.append(true_negative,tp) 203 | # evaluation group by 
repository 204 | repository=np.concatenate((split_[3][:,0].astype('int').reshape(len(split_[3][:,0].astype('int')),1), y_pred_test_.reshape(len(y_pred_test_),1),split_[3][:,1].astype('int').reshape(len(split_[3][:,1].astype('int')),1)), axis=1, out=None) 205 | npm=repository[repository[:,2] == 1] 206 | pypi=repository[repository[:,2] == 2] 207 | rubygems=repository[repository[:,2] == 3] 208 | # precision, recall, f1 for npm 209 | precision_npm=np.append(true_positive,round(precision_score(npm[:,0], npm[:,1])*100,2)) 210 | recall_npm=np.append(precision_npm,round(recall_score(npm[:,0], npm[:,1])*100,2)) 211 | f1_npm=np.append(recall_npm,round(f1_score(npm[:,0], npm[:,1])*100,2)) 212 | acc_npm=np.append(f1_npm,round(accuracy_score(npm[:,0], npm[:,1])*100,2)) 213 | # precision, recall, f1 for pypi 214 | precision_pypi=np.append(acc_npm,round(precision_score(pypi[:,0], pypi[:,1])*100,2)) 215 | recall_pypi=np.append(precision_pypi,round(recall_score(pypi[:,0], pypi[:,1])*100,2)) 216 | f1_pypi=np.append(recall_pypi,round(f1_score(pypi[:,0], pypi[:,1])*100,2)) 217 | metrics=np.append(f1_pypi,round(accuracy_score(pypi[:,0], pypi[:,1])*100,2)) 218 | eval[i]=metrics.tolist() 219 | # replace 0, with NaN 220 | eval=eval.replace(0,np.nan) 221 | mean=eval.mean(axis=1) 222 | std=eval.std(axis=1) 223 | result=pd.concat([mean, std], axis=1) 224 | return(result, get_best_hyperparams(hyperpar_list)) 225 | 226 | # bayes opt grid search decision tree 227 | def grid_tree (regressors,labels): 228 | # grid for the quality of the split 229 | criteria=['gini', 'entropy', 'log_loss'] # 0,1,2 230 | number_features=['sqrt','log2',None] 231 | #function for the maximization of the target 232 | def tree_cl_bo(max_depth, max_features,criterion,min_sample_leaf,min_sample_split): 233 | params_tree={} 234 | params_tree['max_depth'] = int(max_depth) 235 | params_tree['max_features'] = number_features[int(max_features)] 236 | params_tree['criterion']=criteria[int(criterion)] 237 | params_tree['min_sample_leaf']=int(min_sample_leaf) 238 | params_tree['min_sample_split']=int(min_sample_split) 239 | classifier = DecisionTreeClassifier(random_state=123,criterion=params_tree['criterion'],max_depth=params_tree['max_depth'],min_samples_leaf=params_tree['min_sample_leaf'],max_features=params_tree['max_features'],min_samples_split=params_tree['min_sample_split']) 240 | #scores=cross_val_score(classifier,regressors,labels[:,0].astype('int'),cv=5,scoring='precision',n_jobs=-1) 241 | #target=scores.mean() 242 | scoring = {'rec': 'recall', 243 | 'prec': 'precision'} 244 | scores = cross_validate(classifier,regressors,labels[:,0].astype('int'), scoring=scoring, 245 | cv=5, return_train_score=True,n_jobs=-1) 246 | print('recall',round(scores['test_rec'].mean(),2)) 247 | print('precision train',round(scores['train_prec'].mean(),2)) 248 | target=scores['test_prec'].mean() 249 | return (target) 250 | params_tree ={ 251 | 'max_depth':(2, 4), 252 | 'max_features':(0,2.99), 253 | 'criterion':(0,2.99), # int 0,1,2 254 | 'min_sample_leaf':(4,8), 255 | 'min_sample_split':(6,16) 256 | } 257 | tree_bo = BayesianOptimization(tree_cl_bo, params_tree, random_state=111) 258 | tree_bo.maximize(init_points=25, n_iter=5) 259 | print(tree_bo.max) 260 | params_tree = tree_bo.max['params'] 261 | params_tree={} 262 | params_tree["max_features"]=number_features[int(tree_bo.max["params"]["max_features"])] 263 | params_tree["max_depth"] = int(tree_bo.max["params"]["max_depth"]) 264 | params_tree['criterion']= criteria[int(tree_bo.max["params"]["criterion"])] 265 | 
params_tree['min_sample_leaf']=int(tree_bo.max['params']['min_sample_leaf']) 266 | params_tree['min_sample_split']=int(tree_bo.max['params']['min_sample_split']) 267 | #print(params_tree) 268 | return (params_tree) 269 | 270 | 271 | 272 | 273 | ''' 274 | Random Forest 275 | 276 | ''' 277 | 278 | def evaluation_random_forest(database): 279 | database = database.loc[:, ~database.columns.str.contains('^Unnamed')] 280 | database['Package Repository'] = np.where(database['Package Repository'] == "NPM", 1, 2) 281 | 282 | # Dict to set set of params and related precision 283 | hyperpar_list = [] 284 | # define a list for the evaluation metrics 285 | evaluation=['precision','recall','f1', 'accuracy','false positive','false negative','true negative','true positive','precision_npm','recall_npm','f1_npm', 'acc_npm','precision_pypi','recall_pypi','f1_pypi','acc_pypi'] 286 | # define a list for the features 287 | f = [c for c in database.columns if c not in ['Malicious','Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','Package Repository','Package Name']]#,'Number of lines in metadata','Number of Words in metadata' 288 | # initialize the dataframe 289 | eval = pd.DataFrame(data=None, index=[y for x in [f,evaluation] for y in x]) 290 | random_split=[123,333,567,999,876,371,459,111,902,724] 291 | for i in range(0,10): 292 | #split 293 | split_=split_training_testing(database[database['Package Repository']!=3], test_size=0.2, random=random_split[i]) 294 | # optimization of the hyperparameters with cross-validation in the train set 295 | train_rf_=grid_rf(split_[0], split_[2]) 296 | # fit the model with the best hyperparameters 297 | classifier = RandomForestClassifier(random_state=123,criterion=train_rf_['criterion'],n_estimators=train_rf_['n_estimators'],max_depth=train_rf_['max_depth'],max_features=train_rf_['max_features'],min_samples_leaf=train_rf_['min_sample_leaf'],min_samples_split=train_rf_['min_sample_split'],max_samples=train_rf_['max_samples']) 298 | classifier.fit(split_[0], split_[2][:,0].astype('int')) 299 | #predict on test data 300 | y_pred_test_=classifier.predict(split_[1]) 301 | 302 | hyperpar_list.append({'precision':round(precision_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2),'hyperparams':train_rf_}) 303 | # array for features,precison,recall,f1 304 | precision=np.append(classifier.feature_importances_,round(precision_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 305 | recall=np.append(precision,round(recall_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 306 | f1=np.append(recall,round(f1_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 307 | acc=np.append(f1,round(accuracy_score(split_[3][:,0].astype('int'), y_pred_test_)*100,2)) 308 | # false positive, false negative, true negative, true positive 309 | tn,fp,fn,tp = confusion_matrix(split_[3][:,0].astype('int'),y_pred_test_).ravel() 310 | false_positive=np.append(acc,fp) 311 | false_negative=np.append(false_positive,fn) 312 | true_negative=np.append(false_negative,tn) 313 | true_positive=np.append(true_negative,tp) 314 | # evaluation group by repository 315 | repository=np.concatenate((split_[3][:,0].astype('int').reshape(len(split_[3][:,0].astype('int')),1), y_pred_test_.reshape(len(y_pred_test_),1),split_[3][:,1].astype('int').reshape(len(split_[3][:,1].astype('int')),1)), axis=1, out=None) 316 | npm=repository[repository[:,2] == 1] 317 | pypi=repository[repository[:,2] == 2] 318 | rubygems=repository[repository[:,2] == 3] 319 | # precision, recall, f1 for npm 320 | 
precision_npm=np.append(true_positive,round(precision_score(npm[:,0], npm[:,1])*100,2)) 321 | recall_npm=np.append(precision_npm,round(recall_score(npm[:,0], npm[:,1])*100,2)) 322 | f1_npm=np.append(recall_npm,round(f1_score(npm[:,0], npm[:,1])*100,2)) 323 | acc_npm=np.append(f1_npm,round(accuracy_score(npm[:,0], npm[:,1])*100,2)) 324 | # precision, recall, f1 for pypi 325 | precision_pypi=np.append(acc_npm,round(precision_score(pypi[:,0], pypi[:,1])*100,2)) 326 | recall_pypi=np.append(precision_pypi,round(recall_score(pypi[:,0], pypi[:,1])*100,2)) 327 | f1_pypi=np.append(recall_pypi,round(f1_score(pypi[:,0], pypi[:,1])*100,2)) 328 | metrics=np.append(f1_pypi,round(accuracy_score(pypi[:,0], pypi[:,1])*100,2)) 329 | 330 | eval[i]=metrics.tolist() 331 | # replace 0, with NaN 332 | eval=eval.replace(0,np.nan) 333 | mean=eval.mean(axis=1) 334 | std=eval.std(axis=1) 335 | result=pd.concat([mean, std], axis=1) 336 | return(result, get_best_hyperparams(hyperpar_list)) 337 | 338 | 339 | # bayes opt grid search RANDOM FOREST 340 | def grid_rf (regressors,labels): 341 | # grid for the quality of the split 342 | criteria=['gini', 'entropy', 'log_loss'] # 0,1,2 343 | number_features=['sqrt','log2',None] 344 | #function for the maximization of the target 345 | def rf_cl_bo(max_depth, max_features,n_estimators,criterion,min_sample_leaf,min_sample_split,max_samples): 346 | params_rf={} 347 | params_rf['max_depth'] = int(max_depth) 348 | params_rf['max_features'] = number_features[int(max_features)] 349 | params_rf['criterion']=criteria[int(criterion)] 350 | params_rf['n_estimators'] = int(n_estimators) 351 | params_rf['min_sample_leaf']=int(min_sample_leaf) 352 | params_rf['min_sample_split']=int(min_sample_split) 353 | params_rf['max_samples']=max_samples 354 | classifier = RandomForestClassifier(random_state=123,criterion=params_rf['criterion'],n_estimators=params_rf['n_estimators'],max_depth=params_rf['max_depth'],min_samples_leaf=params_rf['min_sample_leaf'],max_features=params_rf['max_features'],max_samples=params_rf['max_samples'],min_samples_split=params_rf['min_sample_split']) 355 | #scores=cross_val_score(classifier,regressors,labels[:,0].astype('int'),cv=5,scoring='precision',n_jobs=-1) 356 | scoring = {'rec': 'recall', 357 | 'prec': 'precision'} 358 | scores = cross_validate(classifier,regressors,labels[:,0].astype('int'), scoring=scoring, 359 | cv=5, return_train_score=True,n_jobs=-1) 360 | print('recall',round(scores['test_rec'].mean(),2)) 361 | print('precision train',round(scores['train_prec'].mean(),2)) 362 | target=scores['test_prec'].mean() 363 | return (target) 364 | params_rf ={ 365 | 'max_depth':(2, 4), 366 | 'max_features':(0,2.99), 367 | 'n_estimators':(64,256), 368 | 'criterion':(0,2.99), # int 0,1,2 369 | 'min_sample_leaf':(4,8), 370 | 'min_sample_split':(6,16), 371 | 'max_samples':(0.1,1) 372 | } 373 | rf_bo = BayesianOptimization(rf_cl_bo, params_rf, random_state=111) 374 | rf_bo.maximize(init_points=25, n_iter=5) 375 | print(rf_bo.max) 376 | params_rf = rf_bo.max['params'] 377 | params_rf={} 378 | params_rf['n_estimators']= int(rf_bo.max["params"]["n_estimators"]) 379 | params_rf["max_features"]=number_features[int(rf_bo.max["params"]["max_features"])] 380 | params_rf["max_depth"] = int(rf_bo.max["params"]["max_depth"]) 381 | params_rf['criterion']= criteria[int(rf_bo.max["params"]["criterion"])] 382 | params_rf['min_sample_leaf']=int(rf_bo.max['params']['min_sample_leaf']) 383 | params_rf['min_sample_split']=int(rf_bo.max['params']['min_sample_split']) 384 | 
params_rf['max_samples']=rf_bo.max['params']['max_samples'] 385 | #print(params_rf) 386 | return (params_rf) --------------------------------------------------------------------------------
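The generalization-language and entropy features above can be exercised in isolation with the helpers in scripts/feature_extraction/utilities_functions.py. A minimal sketch, assuming it is run from that directory so the module imports directly, using made-up tokens; the base passed to shannon_entropy is chosen here to match the four-symbol alphabet of gen_language_4:

import utilities_functions

identifiers = ["require", "XyZ9$Qa", "aaaa", "a1$b2"]                  # hypothetical tokens
patterns = [utilities_functions.gen_language_4(t) for t in identifiers]
# 'require' -> 'lllllll', 'XyZ9$Qa' -> 'uludsul', 'aaaa' -> 'llll', 'a1$b2' -> 'ldsld'

# Shannon entropy of each generalized token; single-symbol patterns such as
# 'lllllll' and 'llll' score 0 and feed the 'homogeneous identifiers/strings' counts.
entropies = [utilities_functions.shannon_entropy(p, base=4) for p in patterns]

# obfuscation() counts patterns containing both a digit ('d') and a symbol ('s'),
# which is what the 'heterogeneous identifiers/strings' features store.
mixed = utilities_functions.obfuscation(patterns, symbols=['u', 'd', 'l', 's'])
print(entropies, mixed)                                                # mixed == 2 ('uludsul', 'ldsld')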
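The lexical indicators (dangerous tokens, IP addresses) and the extension scan rest on the remaining helpers in the same module. A sketch under the assumptions that resources/dangerous_tokens.json deserializes to the flat list of token strings contains_dangerous_token iterates over, that unpacked_packages is a placeholder directory of extracted packages, and that the snippet string stands in for real package code:

import json
import utilities_functions

snippet = "import socket; s = socket.socket(); s.connect(('203.0.113.9', 4444))"

with open("resources/dangerous_tokens.json") as fh:
    dangerous_tokens = json.load(fh)

hits = utilities_functions.contains_dangerous_token(snippet, dangerous_tokens)
ips = utilities_functions.contains_IPAddress(snippet)    # substrings that pass is_IPAddress()

# find_files_of_ext() matches on Path(...).suffix, so the extension must keep its
# leading dot, exactly as count_package_files_extension() builds it ('.' + class).
py_files = utilities_functions.find_files_of_ext("unpacked_packages", ".py")
print(hits, ips, len(py_files))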
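On the modelling side, split_training_testing in scripts/utilities_functions.py is the entry point the evaluation functions build on. A hedged training sketch, assuming it is run from scripts/ and that Labelled_Dataset.csv at the repository root carries the 'Malicious', 'Package Repository' and 'Package Name' columns the helper expects; the XGBoost hyperparameters below are placeholders, not the tuned values grid_xgb_py would return:

import numpy as np
import pandas as pd
import xgboost as xgb
from utilities_functions import split_training_testing, evaluation_NPM_Pypi_xgb

db = pd.read_csv("../Labelled_Dataset.csv")
db = db.loc[:, ~db.columns.str.contains('^Unnamed')]
db['Package Repository'] = np.where(db['Package Repository'] == "NPM", 1, 2)

# 80/20 split, stratified on the malicious flag and the repository of origin
X_train, X_test, y_train, y_test, features = split_training_testing(db, test_size=0.2, random=123)

clf = xgb.XGBClassifier(random_state=123, n_estimators=128, max_depth=3)   # placeholder hyperparameters
clf.fit(X_train, y_train[:, 0].astype('int'))
accuracy = (clf.predict(X_test) == y_test[:, 0].astype('int')).mean()
print(len(features), accuracy)

# evaluation_NPM_Pypi_xgb() performs the same preprocessing itself, so it takes the
# raw frame; it runs ten stratified splits with a Bayesian hyperparameter search and
# returns a mean/std metrics frame plus the best hyperparameters found:
# result, best_params = evaluation_NPM_Pypi_xgb(pd.read_csv("../Labelled_Dataset.csv"))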