├── .gitignore ├── LICENSE ├── README.md ├── data ├── airline_tweets.csv ├── amazon │ ├── xaa.csv │ ├── xab.csv │ └── xac.csv ├── example1.txt ├── example2.txt ├── example3.txt └── sowing_and_reaping.txt ├── images ├── bow.png ├── lda.png ├── nmf.png └── word2vec.png ├── lessons ├── 01_preprocessing.ipynb ├── 02_bag_of_words.ipynb ├── 03_word_embeddings.ipynb └── 04_topic_modeling.ipynb └── solutions ├── 01_preprocessing_solutions.ipynb ├── 02_bag_of_words_solutions.ipynb ├── 03_word_embeddings_solutions.ipynb └── 04_topic_modeling_solutions.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/macos,windows,jupyternotebooks,python,visualstudiocode 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=macos,windows,jupyternotebooks,python,visualstudiocode 4 | 5 | ### JupyterNotebooks ### 6 | # gitignore template for Jupyter Notebooks 7 | # website: http://jupyter.org/ 8 | 9 | .ipynb_checkpoints 10 | */.ipynb_checkpoints/* 11 | 12 | # IPython 13 | profile_default/ 14 | ipython_config.py 15 | 16 | # Remove previous ipynb_checkpoints 17 | # git rm -r .ipynb_checkpoints/ 18 | 19 | ### macOS ### 20 | # General 21 | .DS_Store 22 | .AppleDouble 23 | .LSOverride 24 | 25 | # Icon must end with two \r 26 | Icon 27 | 28 | 29 | # Thumbnails 30 | ._* 31 | 32 | # Files that might appear in the root of a volume 33 | .DocumentRevisions-V100 34 | .fseventsd 35 | .Spotlight-V100 36 | .TemporaryItems 37 | .Trashes 38 | .VolumeIcon.icns 39 | .com.apple.timemachine.donotpresent 40 | 41 | # Directories potentially created on remote AFP share 42 | .AppleDB 43 | .AppleDesktop 44 | Network Trash Folder 45 | Temporary Items 46 | .apdisk 47 | 48 | ### Python ### 49 | # Byte-compiled / optimized / DLL files 50 | __pycache__/ 51 | *.py[cod] 52 | *$py.class 53 | 54 | # C extensions 55 | *.so 56 | 57 | # Distribution / packaging 58 | .Python 59 | build/ 60 | develop-eggs/ 61 | dist/ 62 | downloads/ 63 | eggs/ 64 | .eggs/ 65 | lib/ 66 | lib64/ 67 | parts/ 68 | sdist/ 69 | var/ 70 | wheels/ 71 | share/python-wheels/ 72 | *.egg-info/ 73 | .installed.cfg 74 | *.egg 75 | MANIFEST 76 | 77 | # PyInstaller 78 | # Usually these files are written by a python script from a template 79 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 80 | *.manifest 81 | *.spec 82 | 83 | # Installer logs 84 | pip-log.txt 85 | pip-delete-this-directory.txt 86 | 87 | # Unit test / coverage reports 88 | htmlcov/ 89 | .tox/ 90 | .nox/ 91 | .coverage 92 | .coverage.* 93 | .cache 94 | nosetests.xml 95 | coverage.xml 96 | *.cover 97 | *.py,cover 98 | .hypothesis/ 99 | .pytest_cache/ 100 | cover/ 101 | 102 | # Translations 103 | *.mo 104 | *.pot 105 | 106 | # Django stuff: 107 | *.log 108 | local_settings.py 109 | db.sqlite3 110 | db.sqlite3-journal 111 | 112 | # Flask stuff: 113 | instance/ 114 | .webassets-cache 115 | 116 | # Scrapy stuff: 117 | .scrapy 118 | 119 | # Sphinx documentation 120 | docs/_build/ 121 | 122 | # PyBuilder 123 | .pybuilder/ 124 | target/ 125 | 126 | # Jupyter Notebook 127 | 128 | # IPython 129 | 130 | # pyenv 131 | # For a library or package, you might want to ignore these files since the code is 132 | # intended to run in multiple environments; otherwise, check them in: 133 | # .python-version 134 | 135 | # pipenv 136 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
137 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 138 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 139 | # install all needed dependencies. 140 | #Pipfile.lock 141 | 142 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 143 | __pypackages__/ 144 | 145 | # Celery stuff 146 | celerybeat-schedule 147 | celerybeat.pid 148 | 149 | # SageMath parsed files 150 | *.sage.py 151 | 152 | # Environments 153 | .env 154 | .venv 155 | env/ 156 | venv/ 157 | ENV/ 158 | env.bak/ 159 | venv.bak/ 160 | 161 | # Spyder project settings 162 | .spyderproject 163 | .spyproject 164 | 165 | # Rope project settings 166 | .ropeproject 167 | 168 | # mkdocs documentation 169 | /site 170 | 171 | # mypy 172 | .mypy_cache/ 173 | .dmypy.json 174 | dmypy.json 175 | 176 | # Pyre type checker 177 | .pyre/ 178 | 179 | # pytype static type analyzer 180 | .pytype/ 181 | 182 | # Cython debug symbols 183 | cython_debug/ 184 | 185 | ### VisualStudioCode ### 186 | .vscode/* 187 | !.vscode/settings.json 188 | !.vscode/tasks.json 189 | !.vscode/launch.json 190 | !.vscode/extensions.json 191 | *.code-workspace 192 | 193 | # Local History for Visual Studio Code 194 | .history/ 195 | 196 | ### VisualStudioCode Patch ### 197 | # Ignore all local history of files 198 | .history 199 | .ionide 200 | 201 | # Support for Project snippet scope 202 | !.vscode/*.code-snippets 203 | 204 | ### Windows ### 205 | # Windows thumbnail cache files 206 | Thumbs.db 207 | Thumbs.db:encryptable 208 | ehthumbs.db 209 | ehthumbs_vista.db 210 | 211 | # Dump file 212 | *.stackdump 213 | 214 | # Folder config file 215 | [Dd]esktop.ini 216 | 217 | # Recycle Bin used on file shares 218 | $RECYCLE.BIN/ 219 | 220 | # Windows Installer files 221 | *.cab 222 | *.msi 223 | *.msix 224 | *.msm 225 | *.msp 226 | 227 | # Windows shortcuts 228 | *.lnk 229 | 230 | # End of https://www.toptal.com/developers/gitignore/api/macos,windows,jupyternotebooks,python,visualstudiocode 231 | 232 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 
23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution 4.0 International Public License 58 | 59 | By exercising the Licensed Rights (defined below), You accept and agree 60 | to be bound by the terms and conditions of this Creative Commons 61 | Attribution 4.0 International Public License ("Public License"). To the 62 | extent this Public License may be interpreted as a contract, You are 63 | granted the Licensed Rights in consideration of Your acceptance of 64 | these terms and conditions, and the Licensor grants You such rights in 65 | consideration of benefits the Licensor receives from making the 66 | Licensed Material available under these terms and conditions. 67 | 68 | 69 | Section 1 -- Definitions. 70 | 71 | a. Adapted Material means material subject to Copyright and Similar 72 | Rights that is derived from or based upon the Licensed Material 73 | and in which the Licensed Material is translated, altered, 74 | arranged, transformed, or otherwise modified in a manner requiring 75 | permission under the Copyright and Similar Rights held by the 76 | Licensor. For purposes of this Public License, where the Licensed 77 | Material is a musical work, performance, or sound recording, 78 | Adapted Material is always produced where the Licensed Material is 79 | synched in timed relation with a moving image. 80 | 81 | b. Adapter's License means the license You apply to Your Copyright 82 | and Similar Rights in Your contributions to Adapted Material in 83 | accordance with the terms and conditions of this Public License. 84 | 85 | c. 
Copyright and Similar Rights means copyright and/or similar rights 86 | closely related to copyright including, without limitation, 87 | performance, broadcast, sound recording, and Sui Generis Database 88 | Rights, without regard to how the rights are labeled or 89 | categorized. For purposes of this Public License, the rights 90 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 91 | Rights. 92 | 93 | d. Effective Technological Measures means those measures that, in the 94 | absence of proper authority, may not be circumvented under laws 95 | fulfilling obligations under Article 11 of the WIPO Copyright 96 | Treaty adopted on December 20, 1996, and/or similar international 97 | agreements. 98 | 99 | e. Exceptions and Limitations means fair use, fair dealing, and/or 100 | any other exception or limitation to Copyright and Similar Rights 101 | that applies to Your use of the Licensed Material. 102 | 103 | f. Licensed Material means the artistic or literary work, database, 104 | or other material to which the Licensor applied this Public 105 | License. 106 | 107 | g. Licensed Rights means the rights granted to You subject to the 108 | terms and conditions of this Public License, which are limited to 109 | all Copyright and Similar Rights that apply to Your use of the 110 | Licensed Material and that the Licensor has authority to license. 111 | 112 | h. Licensor means the individual(s) or entity(ies) granting rights 113 | under this Public License. 114 | 115 | i. Share means to provide material to the public by any means or 116 | process that requires permission under the Licensed Rights, such 117 | as reproduction, public display, public performance, distribution, 118 | dissemination, communication, or importation, and to make material 119 | available to the public including in ways that members of the 120 | public may access the material from a place and at a time 121 | individually chosen by them. 122 | 123 | j. Sui Generis Database Rights means rights other than copyright 124 | resulting from Directive 96/9/EC of the European Parliament and of 125 | the Council of 11 March 1996 on the legal protection of databases, 126 | as amended and/or succeeded, as well as other essentially 127 | equivalent rights anywhere in the world. 128 | 129 | k. You means the individual or entity exercising the Licensed Rights 130 | under this Public License. Your has a corresponding meaning. 131 | 132 | 133 | Section 2 -- Scope. 134 | 135 | a. License grant. 136 | 137 | 1. Subject to the terms and conditions of this Public License, 138 | the Licensor hereby grants You a worldwide, royalty-free, 139 | non-sublicensable, non-exclusive, irrevocable license to 140 | exercise the Licensed Rights in the Licensed Material to: 141 | 142 | a. reproduce and Share the Licensed Material, in whole or 143 | in part; and 144 | 145 | b. produce, reproduce, and Share Adapted Material. 146 | 147 | 2. Exceptions and Limitations. For the avoidance of doubt, where 148 | Exceptions and Limitations apply to Your use, this Public 149 | License does not apply, and You do not need to comply with 150 | its terms and conditions. 151 | 152 | 3. Term. The term of this Public License is specified in Section 153 | 6(a). 154 | 155 | 4. Media and formats; technical modifications allowed. The 156 | Licensor authorizes You to exercise the Licensed Rights in 157 | all media and formats whether now known or hereafter created, 158 | and to make technical modifications necessary to do so. 
The 159 | Licensor waives and/or agrees not to assert any right or 160 | authority to forbid You from making technical modifications 161 | necessary to exercise the Licensed Rights, including 162 | technical modifications necessary to circumvent Effective 163 | Technological Measures. For purposes of this Public License, 164 | simply making modifications authorized by this Section 2(a) 165 | (4) never produces Adapted Material. 166 | 167 | 5. Downstream recipients. 168 | 169 | a. Offer from the Licensor -- Licensed Material. Every 170 | recipient of the Licensed Material automatically 171 | receives an offer from the Licensor to exercise the 172 | Licensed Rights under the terms and conditions of this 173 | Public License. 174 | 175 | b. No downstream restrictions. You may not offer or impose 176 | any additional or different terms or conditions on, or 177 | apply any Effective Technological Measures to, the 178 | Licensed Material if doing so restricts exercise of the 179 | Licensed Rights by any recipient of the Licensed 180 | Material. 181 | 182 | 6. No endorsement. Nothing in this Public License constitutes or 183 | may be construed as permission to assert or imply that You 184 | are, or that Your use of the Licensed Material is, connected 185 | with, or sponsored, endorsed, or granted official status by, 186 | the Licensor or others designated to receive attribution as 187 | provided in Section 3(a)(1)(A)(i). 188 | 189 | b. Other rights. 190 | 191 | 1. Moral rights, such as the right of integrity, are not 192 | licensed under this Public License, nor are publicity, 193 | privacy, and/or other similar personality rights; however, to 194 | the extent possible, the Licensor waives and/or agrees not to 195 | assert any such rights held by the Licensor to the limited 196 | extent necessary to allow You to exercise the Licensed 197 | Rights, but not otherwise. 198 | 199 | 2. Patent and trademark rights are not licensed under this 200 | Public License. 201 | 202 | 3. To the extent possible, the Licensor waives any right to 203 | collect royalties from You for the exercise of the Licensed 204 | Rights, whether directly or through a collecting society 205 | under any voluntary or waivable statutory or compulsory 206 | licensing scheme. In all other cases the Licensor expressly 207 | reserves any right to collect such royalties. 208 | 209 | 210 | Section 3 -- License Conditions. 211 | 212 | Your exercise of the Licensed Rights is expressly made subject to the 213 | following conditions. 214 | 215 | a. Attribution. 216 | 217 | 1. If You Share the Licensed Material (including in modified 218 | form), You must: 219 | 220 | a. retain the following if it is supplied by the Licensor 221 | with the Licensed Material: 222 | 223 | i. identification of the creator(s) of the Licensed 224 | Material and any others designated to receive 225 | attribution, in any reasonable manner requested by 226 | the Licensor (including by pseudonym if 227 | designated); 228 | 229 | ii. a copyright notice; 230 | 231 | iii. a notice that refers to this Public License; 232 | 233 | iv. a notice that refers to the disclaimer of 234 | warranties; 235 | 236 | v. a URI or hyperlink to the Licensed Material to the 237 | extent reasonably practicable; 238 | 239 | b. indicate if You modified the Licensed Material and 240 | retain an indication of any previous modifications; and 241 | 242 | c. 
indicate the Licensed Material is licensed under this 243 | Public License, and include the text of, or the URI or 244 | hyperlink to, this Public License. 245 | 246 | 2. You may satisfy the conditions in Section 3(a)(1) in any 247 | reasonable manner based on the medium, means, and context in 248 | which You Share the Licensed Material. For example, it may be 249 | reasonable to satisfy the conditions by providing a URI or 250 | hyperlink to a resource that includes the required 251 | information. 252 | 253 | 3. If requested by the Licensor, You must remove any of the 254 | information required by Section 3(a)(1)(A) to the extent 255 | reasonably practicable. 256 | 257 | 4. If You Share Adapted Material You produce, the Adapter's 258 | License You apply must not prevent recipients of the Adapted 259 | Material from complying with this Public License. 260 | 261 | 262 | Section 4 -- Sui Generis Database Rights. 263 | 264 | Where the Licensed Rights include Sui Generis Database Rights that 265 | apply to Your use of the Licensed Material: 266 | 267 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 268 | to extract, reuse, reproduce, and Share all or a substantial 269 | portion of the contents of the database; 270 | 271 | b. if You include all or a substantial portion of the database 272 | contents in a database in which You have Sui Generis Database 273 | Rights, then the database in which You have Sui Generis Database 274 | Rights (but not its individual contents) is Adapted Material; and 275 | 276 | c. You must comply with the conditions in Section 3(a) if You Share 277 | all or a substantial portion of the contents of the database. 278 | 279 | For the avoidance of doubt, this Section 4 supplements and does not 280 | replace Your obligations under this Public License where the Licensed 281 | Rights include other Copyright and Similar Rights. 282 | 283 | 284 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 285 | 286 | a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 287 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 288 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 289 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 290 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 291 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 292 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 293 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 294 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 295 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 296 | 297 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 298 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 299 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 300 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 301 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 302 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 303 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 304 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 305 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 306 | 307 | c. The disclaimer of warranties and limitation of liability provided 308 | above shall be interpreted in a manner that, to the extent 309 | possible, most closely approximates an absolute disclaimer and 310 | waiver of all liability. 
311 | 312 | 313 | Section 6 -- Term and Termination. 314 | 315 | a. This Public License applies for the term of the Copyright and 316 | Similar Rights licensed here. However, if You fail to comply with 317 | this Public License, then Your rights under this Public License 318 | terminate automatically. 319 | 320 | b. Where Your right to use the Licensed Material has terminated under 321 | Section 6(a), it reinstates: 322 | 323 | 1. automatically as of the date the violation is cured, provided 324 | it is cured within 30 days of Your discovery of the 325 | violation; or 326 | 327 | 2. upon express reinstatement by the Licensor. 328 | 329 | For the avoidance of doubt, this Section 6(b) does not affect any 330 | right the Licensor may have to seek remedies for Your violations 331 | of this Public License. 332 | 333 | c. For the avoidance of doubt, the Licensor may also offer the 334 | Licensed Material under separate terms or conditions or stop 335 | distributing the Licensed Material at any time; however, doing so 336 | will not terminate this Public License. 337 | 338 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 339 | License. 340 | 341 | 342 | Section 7 -- Other Terms and Conditions. 343 | 344 | a. The Licensor shall not be bound by any additional or different 345 | terms or conditions communicated by You unless expressly agreed. 346 | 347 | b. Any arrangements, understandings, or agreements regarding the 348 | Licensed Material not stated herein are separate from and 349 | independent of the terms and conditions of this Public License. 350 | 351 | 352 | Section 8 -- Interpretation. 353 | 354 | a. For the avoidance of doubt, this Public License does not, and 355 | shall not be interpreted to, reduce, limit, restrict, or impose 356 | conditions on any use of the Licensed Material that could lawfully 357 | be made without permission under this Public License. 358 | 359 | b. To the extent possible, if any provision of this Public License is 360 | deemed unenforceable, it shall be automatically reformed to the 361 | minimum extent necessary to make it enforceable. If the provision 362 | cannot be reformed, it shall be severed from this Public License 363 | without affecting the enforceability of the remaining terms and 364 | conditions. 365 | 366 | c. No term or condition of this Public License will be waived and no 367 | failure to comply consented to unless expressly agreed to by the 368 | Licensor. 369 | 370 | d. Nothing in this Public License constitutes or may be interpreted 371 | as a limitation upon, or waiver of, any privileges and immunities 372 | that apply to the Licensor or You, including from the legal 373 | processes of any jurisdiction or authority. 374 | 375 | 376 | ======================================================================= 377 | 378 | Creative Commons is not a party to its public 379 | licenses. Notwithstanding, Creative Commons may elect to apply one of 380 | its public licenses to material it publishes and in those instances 381 | will be considered the “Licensor.” The text of the Creative Commons 382 | public licenses is dedicated to the public domain under the CC0 Public 383 | Domain Dedication. 
Except for the limited purpose of indicating that 384 | material is shared under a Creative Commons public license or as 385 | otherwise permitted by the Creative Commons policies published at 386 | creativecommons.org/policies, Creative Commons does not authorize the 387 | use of the trademark "Creative Commons" or any other trademark or logo 388 | of Creative Commons without its prior written consent including, 389 | without limitation, in connection with any unauthorized modifications 390 | to any of its public licenses or any other arrangements, 391 | understandings, or agreements concerning use of licensed material. For 392 | the avoidance of doubt, this paragraph does not form part of the 393 | public licenses. 394 | 395 | Creative Commons may be contacted at creativecommons.org. 396 | 397 | 398 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # D-Lab Python Text Analysis Workshop 2 | 3 | [![Datahub](https://img.shields.io/badge/launch-datahub-blue)](https://dlab.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Text-Analysis&urlpath=lab%2Ftree%2FPython-Text-Analysis%2F&branch=main) 4 | [![Binder](https://img.shields.io/badge/launch-binder-579aca.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFkAAABZCAMAAABi1XidAAAB8lBMVEX///9XmsrmZYH1olJXmsr1olJXmsrmZYH1olJXmsr1olJXmsrmZYH1olL1olJXmsr1olJXmsrmZYH1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olJXmsrmZYH1olL1olL0nFf1olJXmsrmZYH1olJXmsq8dZb1olJXmsrmZYH1olJXmspXmspXmsr1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olLeaIVXmsrmZYH1olL1olL1olJXmsrmZYH1olLna31Xmsr1olJXmsr1olJXmsrmZYH1olLqoVr1olJXmsr1olJXmsrmZYH1olL1olKkfaPobXvviGabgadXmsqThKuofKHmZ4Dobnr1olJXmsr1olJXmspXmsr1olJXmsrfZ4TuhWn1olL1olJXmsqBi7X1olJXmspZmslbmMhbmsdemsVfl8ZgmsNim8Jpk8F0m7R4m7F5nLB6jbh7jbiDirOEibOGnKaMhq+PnaCVg6qWg6qegKaff6WhnpKofKGtnomxeZy3noG6dZi+n3vCcpPDcpPGn3bLb4/Mb47UbIrVa4rYoGjdaIbeaIXhoWHmZYHobXvpcHjqdHXreHLroVrsfG/uhGnuh2bwj2Hxk17yl1vzmljzm1j0nlX1olL3AJXWAAAAbXRSTlMAEBAQHx8gICAuLjAwMDw9PUBAQEpQUFBXV1hgYGBkcHBwcXl8gICAgoiIkJCQlJicnJ2goKCmqK+wsLC4usDAwMjP0NDQ1NbW3Nzg4ODi5+3v8PDw8/T09PX29vb39/f5+fr7+/z8/Pz9/v7+zczCxgAABC5JREFUeAHN1ul3k0UUBvCb1CTVpmpaitAGSLSpSuKCLWpbTKNJFGlcSMAFF63iUmRccNG6gLbuxkXU66JAUef/9LSpmXnyLr3T5AO/rzl5zj137p136BISy44fKJXuGN/d19PUfYeO67Znqtf2KH33Id1psXoFdW30sPZ1sMvs2D060AHqws4FHeJojLZqnw53cmfvg+XR8mC0OEjuxrXEkX5ydeVJLVIlV0e10PXk5k7dYeHu7Cj1j+49uKg7uLU61tGLw1lq27ugQYlclHC4bgv7VQ+TAyj5Zc/UjsPvs1sd5cWryWObtvWT2EPa4rtnWW3JkpjggEpbOsPr7F7EyNewtpBIslA7p43HCsnwooXTEc3UmPmCNn5lrqTJxy6nRmcavGZVt/3Da2pD5NHvsOHJCrdc1G2r3DITpU7yic7w/7Rxnjc0kt5GC4djiv2Sz3Fb2iEZg41/ddsFDoyuYrIkmFehz0HR2thPgQqMyQYb2OtB0WxsZ3BeG3+wpRb1vzl2UYBog8FfGhttFKjtAclnZYrRo9ryG9uG/FZQU4AEg8ZE9LjGMzTmqKXPLnlWVnIlQQTvxJf8ip7VgjZjyVPrjw1te5otM7RmP7xm+sK2Gv9I8Gi++BRbEkR9EBw8zRUcKxwp73xkaLiqQb+kGduJTNHG72zcW9LoJgqQxpP3/Tj//c3yB0tqzaml05/+orHLksVO+95kX7/7qgJvnjlrfr2Ggsyx0eoy9uPzN5SPd86aXggOsEKW2Prz7du3VID3/tzs/sSRs2w7ovVHKtjrX2pd7ZMlTxAYfBAL9jiDwfLkq55Tm7ifhMlTGPyCAs7RFRhn47JnlcB9RM5T97ASuZXIcVNuUDIndpDbdsfrqsOppeXl5Y+XVKdjFCTh+zGaVuj0d9zy05PPK3QzBamxdwtTCrzyg/2Rvf2EstUjordGwa/kx9mSJLr8mLLtCW8HHGJc2R5hS219IiF6PnTusOqcMl57gm0Z8kanKMAQg0qSyuZfn7zItsbGyO9QlnxY0eCuD1XL2ys/MsrQhltE7Ug0uFOzufJFE2PxBo/YAx8XPPdDwWN0MrDRYIZF0mSMKCNHgaIVFoBbNoLJ7tEQDKxGF0kcLQimojCZopv0OkNOyWCCg9XMVAi7ARJzQdM2QUh0gmBozjc3Skg6dSBRqDGYSUOu66Zg+I2fNZs/M3/f/Grl/XnyF1Gw3VKCez0PN5IUfFLqvgUN4C0qNqYs5YhPL+aVZYDE4IpUk57oSFnJm4FyCqqOE0jhY
2SMyLFoo56zyo6becOS5UVDdj7Vih0zp+tcMhwRpBeLyqtIjlJKAIZSbI8SGSF3k0pA3mR5tHuwPFoa7N7reoq2bqCsAk1HqCu5uvI1n6JuRXI+S1Mco54YmYTwcn6Aeic+kssXi8XpXC4V3t7/ADuTNKaQJdScAAAAAElFTkSuQmCC)](https://mybinder.org/v2/gh/dlab-berkeley/Python-Text-Analysis/HEAD) 5 | [![License: CC BY 4.0](https://img.shields.io/badge/License-CC_BY_4.0-lightgrey.svg)](https://creativecommons.org/licenses/by/4.0/) 6 | 7 | This repository contains the materials for the D-Lab Python Text Analysis 8 | workshop. We recommend attending Python Fundamentals, Python Data Wrangling, and 9 | Python Machine Learning Fundamentals prior to this workshop. 10 | 11 | ## Workshop Goals 12 | 13 | This workshop is part of a loosely-coupled 4-part text analysis workshop series 14 | that will prepare participants to move forward with research that uses text 15 | analysis, with a special focus on social science applications. We explore 16 | fundamental approaches to applying computational methods to text in Python. We 17 | cover some of the major packages used in natural language processing, including 18 | scikit-learn, NLTK, spaCy, and Gensim. 19 | 20 | The first two parts are taught as a joint series. Parts 3 and 4 can be attended 21 | "a la carte". However, we recommend attending Parts 1 and 2 prior to attending 22 | Parts 3 and 4. These parts are as follows: 23 | 24 | 1. **Part 1: Preprocessing Text.** How do we standardize and clean text 25 | documents? Text data is noisy, and we often need to develop a pipeline in 26 | order to standardize the data to better facilitate computational modeling. In 27 | the first part of this workshop, we walk through possible steps in this 28 | pipeline using tools from basic Python, NLTK, and spaCy in order to 29 | preprocess and tokenize text data. 30 | 2. **Part 2: Bag-of-words Representations.** How do we convert text into a 31 | representation that we can operate on computationally? This requires 32 | developing a *numerical representation* of the text. In this part of the 33 | workshop, we study one of the foundational numerical representation of text 34 | data: the bag-of-words model. This model relies heavily on word frequencies 35 | in order to characterize text corpora. We use bag-of-words models and 36 | variations (e.g., TF-IDF) to perform sentiment classification. 37 | 3. **Part 3: Topic Modeling.** How do we identify topics within a corpus of 38 | documents? In this part, we study unsupervised learning of text data. 39 | Specifically, we use topic models such as Latent Dirichlet Allocation and 40 | Non-negative Matrix Factorization to construct "topics" in text from the 41 | statistical regularities in the data. 42 | 4. **Part 4: Word Embeddings.** How can we use neural networks to create 43 | meaningful representations of words? The bag-of-words is limited in its 44 | ability to characterize text, because it does not utilize word context. In 45 | this part, we study word embeddings, which were among the first attempts to 46 | use neural networks to develop numerical representations of text that 47 | incorporate context. We learn how to use the package Gensim to construct 48 | and explore word embeddings of text. 49 | 50 | ## Installation Instructions 51 | 52 | Anaconda is a useful package management software that allows you to run Python 53 | and Jupyter notebooks easily. Installing Anaconda is the easiest way to make 54 | sure you have all the necessary software to run the materials for this workshop. 
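Once the installation steps below are complete, you can optionally confirm that the environment is ready by importing the main packages used in this series. This quick check is not part of the workshop materials; the package list is an assumption based on the tools named above (scikit-learn, NLTK, spaCy, and Gensim, plus pandas, which the notebooks use for reading data), and anything missing can also be installed later from within the notebooks.

```python
# Optional sanity check: confirm the core packages import and report their versions.
import importlib

for name in ["pandas", "sklearn", "nltk", "spacy", "gensim"]:
    try:
        module = importlib.import_module(name)
        print(f"{name}: {getattr(module, '__version__', 'unknown version')}")
    except ImportError:
        print(f"{name}: not installed yet")
```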
55 | If you would like to run Python on your own computer, complete the following 56 | steps prior to the workshop: 57 | 58 | 1. [Download and install Anaconda (Python 3.9 59 | distribution)](https://www.anaconda.com/products/individual). Click the 60 | "Download" button. 61 | 62 | 2. Download the Python Text Analysis [workshop 63 | materials](https://github.com/dlab-berkeley/Python-Text-Analysis): 64 | 65 | - Click the green "Code" button in the top right of the repository 66 | information. 67 | - Click "Download Zip". 68 | - Extract this file to a folder on your computer where you can easily 69 | access it (we recommend Desktop). 70 | 71 | 3. Optional: if you're familiar with `git`, you can instead clone this 72 | repository by opening a terminal and entering the command `git clone 73 | git@github.com:dlab-berkeley/Python-Text-Analysis.git`. 74 | 75 | ## Is Python Not Working on Your Laptop? 76 | 77 | If you do not have Anaconda installed and the materials loaded on your laptop 78 | by the time the workshop starts, we *strongly* recommend using the D-Lab DataHub to 79 | run the materials for these lessons. You can access the DataHub by clicking the 80 | following button: 81 | 82 | [![Datahub](https://img.shields.io/badge/launch-datahub-blue)](https://dlab.datahub.berkeley.edu/hub/user-redirect/git-pull?repo=https%3A%2F%2Fgithub.com%2Fdlab-berkeley%2FPython-Text-Analysis&urlpath=lab%2Ftree%2FPython-Text-Analysis%2F&branch=main) 83 | 84 | The DataHub downloads this repository, along with any necessary packages, and 85 | allows you to run the materials in a Jupyter notebook that is stored on UC 86 | Berkeley's servers. No installation is necessary from your end - you only need 87 | an internet browser and a CalNet ID to log in. By using the DataHub, you can 88 | save your work and come back to it at any time. When you want to return to your 89 | saved work, just go straight to [DataHub](https://datahub.berkeley.edu), sign 90 | in, and click on the `Python-Text-Analysis` folder.
91 | 92 | If you don't have a Berkeley CalNet ID, you can still run these lessons in the 93 | cloud, by clicking this button: 94 | 95 | [![Binder](https://img.shields.io/badge/launch-binder-579aca.svg?logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFkAAABZCAMAAABi1XidAAAB8lBMVEX///9XmsrmZYH1olJXmsr1olJXmsrmZYH1olJXmsr1olJXmsrmZYH1olL1olJXmsr1olJXmsrmZYH1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olJXmsrmZYH1olL1olL0nFf1olJXmsrmZYH1olJXmsq8dZb1olJXmsrmZYH1olJXmspXmspXmsr1olL1olJXmsrmZYH1olJXmsr1olL1olJXmsrmZYH1olL1olLeaIVXmsrmZYH1olL1olL1olJXmsrmZYH1olLna31Xmsr1olJXmsr1olJXmsrmZYH1olLqoVr1olJXmsr1olJXmsrmZYH1olL1olKkfaPobXvviGabgadXmsqThKuofKHmZ4Dobnr1olJXmsr1olJXmspXmsr1olJXmsrfZ4TuhWn1olL1olJXmsqBi7X1olJXmspZmslbmMhbmsdemsVfl8ZgmsNim8Jpk8F0m7R4m7F5nLB6jbh7jbiDirOEibOGnKaMhq+PnaCVg6qWg6qegKaff6WhnpKofKGtnomxeZy3noG6dZi+n3vCcpPDcpPGn3bLb4/Mb47UbIrVa4rYoGjdaIbeaIXhoWHmZYHobXvpcHjqdHXreHLroVrsfG/uhGnuh2bwj2Hxk17yl1vzmljzm1j0nlX1olL3AJXWAAAAbXRSTlMAEBAQHx8gICAuLjAwMDw9PUBAQEpQUFBXV1hgYGBkcHBwcXl8gICAgoiIkJCQlJicnJ2goKCmqK+wsLC4usDAwMjP0NDQ1NbW3Nzg4ODi5+3v8PDw8/T09PX29vb39/f5+fr7+/z8/Pz9/v7+zczCxgAABC5JREFUeAHN1ul3k0UUBvCb1CTVpmpaitAGSLSpSuKCLWpbTKNJFGlcSMAFF63iUmRccNG6gLbuxkXU66JAUef/9LSpmXnyLr3T5AO/rzl5zj137p136BISy44fKJXuGN/d19PUfYeO67Znqtf2KH33Id1psXoFdW30sPZ1sMvs2D060AHqws4FHeJojLZqnw53cmfvg+XR8mC0OEjuxrXEkX5ydeVJLVIlV0e10PXk5k7dYeHu7Cj1j+49uKg7uLU61tGLw1lq27ugQYlclHC4bgv7VQ+TAyj5Zc/UjsPvs1sd5cWryWObtvWT2EPa4rtnWW3JkpjggEpbOsPr7F7EyNewtpBIslA7p43HCsnwooXTEc3UmPmCNn5lrqTJxy6nRmcavGZVt/3Da2pD5NHvsOHJCrdc1G2r3DITpU7yic7w/7Rxnjc0kt5GC4djiv2Sz3Fb2iEZg41/ddsFDoyuYrIkmFehz0HR2thPgQqMyQYb2OtB0WxsZ3BeG3+wpRb1vzl2UYBog8FfGhttFKjtAclnZYrRo9ryG9uG/FZQU4AEg8ZE9LjGMzTmqKXPLnlWVnIlQQTvxJf8ip7VgjZjyVPrjw1te5otM7RmP7xm+sK2Gv9I8Gi++BRbEkR9EBw8zRUcKxwp73xkaLiqQb+kGduJTNHG72zcW9LoJgqQxpP3/Tj//c3yB0tqzaml05/+orHLksVO+95kX7/7qgJvnjlrfr2Ggsyx0eoy9uPzN5SPd86aXggOsEKW2Prz7du3VID3/tzs/sSRs2w7ovVHKtjrX2pd7ZMlTxAYfBAL9jiDwfLkq55Tm7ifhMlTGPyCAs7RFRhn47JnlcB9RM5T97ASuZXIcVNuUDIndpDbdsfrqsOppeXl5Y+XVKdjFCTh+zGaVuj0d9zy05PPK3QzBamxdwtTCrzyg/2Rvf2EstUjordGwa/kx9mSJLr8mLLtCW8HHGJc2R5hS219IiF6PnTusOqcMl57gm0Z8kanKMAQg0qSyuZfn7zItsbGyO9QlnxY0eCuD1XL2ys/MsrQhltE7Ug0uFOzufJFE2PxBo/YAx8XPPdDwWN0MrDRYIZF0mSMKCNHgaIVFoBbNoLJ7tEQDKxGF0kcLQimojCZopv0OkNOyWCCg9XMVAi7ARJzQdM2QUh0gmBozjc3Skg6dSBRqDGYSUOu66Zg+I2fNZs/M3/f/Grl/XnyF1Gw3VKCez0PN5IUfFLqvgUN4C0qNqYs5YhPL+aVZYDE4IpUk57oSFnJm4FyCqqOE0jhY2SMyLFoo56zyo6becOS5UVDdj7Vih0zp+tcMhwRpBeLyqtIjlJKAIZSbI8SGSF3k0pA3mR5tHuwPFoa7N7reoq2bqCsAk1HqCu5uvI1n6JuRXI+S1Mco54YmYTwcn6Aeic+kssXi8XpXC4V3t7/ADuTNKaQJdScAAAAAElFTkSuQmCC)](https://mybinder.org/v2/gh/dlab-berkeley/Python-Text-Analysis/HEAD) 96 | 97 | Binder operates similarly to the D-Lab DataHub, but on a different set of 98 | servers. By using Binder, however, you cannot save your work. 99 | 100 | ## Run the Code 101 | 102 | Now that you have all the required software and materials, you need to run the 103 | code. 104 | 105 | 1. Open the Anaconda Navigator application. You should see the green snake logo 106 | appear on your screen. Note that this can take a few minutes to load up the 107 | first time. 108 | 109 | 2. Click the "Launch" button under "JupyterLab" and navigate through your file 110 | system on the left hand pane to the `Python-Text-Analysis` folder you 111 | downloaded above. Note that, if you download the materials from GitHub, the 112 | folder name may instead be `Python-Text-Analysis-main`. 113 | 114 | 3. Go to the `lessons` folder and find the notebook corresponding to the 115 | workshop you are attending. 
116 | 117 | 4. Press Shift + Enter (or Ctrl + Enter) to run a cell. 118 | 119 | 5. You will need to install additional packages depending on which workshop you 120 | are attending. The install commands are performed in the notebooks, as you 121 | proceed through each part of the workshop. 122 | 123 | Note that all of the above steps can be run from the terminal, if you're 124 | familiar with how to interact with Anaconda in that fashion. However, using 125 | Anaconda Navigator is the easiest way to get started if this is your first time 126 | working with Anaconda. 127 | 128 | # Additional Resources 129 | 130 | - [Computational Text Analysis Working Group (CTAWG)](http://dlabctawg.github.io) 131 | - [Info 256: Applied Natural Language Processing](https://www.ischool.berkeley.edu/courses/info/256) 132 | - [*Speech and Language Processing*](https://web.stanford.edu/~jurafsky/slp3/) by Jurafsky and Martin. 133 | - [Modern Deep Learning Techniques Applied to Natural Language Processing](https://nlpoverview.com/index.html) (online textbook) 134 | 135 | # About the UC Berkeley D-Lab 136 | 137 | D-Lab works with Berkeley faculty, research staff, and students to advance 138 | data-intensive social science and humanities research. Our goal at D-Lab is to 139 | provide practical training, staff support, resources, and space to enable you to 140 | use Python for your own research applications. Our services cater to all skill levels 141 | and no programming, statistical, or computer science backgrounds are necessary. 142 | We offer these services in the form of workshops, one-to-one consulting, and 143 | working groups that cover a variety of research topics, digital tools, and 144 | programming languages. 145 | 146 | Visit the [D-Lab homepage](https://dlab.berkeley.edu/) to learn more about us. 147 | You can view our [calendar](https://dlab.berkeley.edu/events/calendar) for 148 | upcoming events, learn about how to utilize our 149 | [consulting](https://dlab.berkeley.edu/consulting) and [data 150 | services](https://dlab.berkeley.edu/data), and check out upcoming 151 | [workshops](https://dlab.berkeley.edu/events/workshops). Subscribe to our 152 | [newsletter](https://dlab.berkeley.edu/news/weekly-newsletter) to stay up to 153 | date on D-Lab events, services, and opportunities. 154 | 155 | # Other D-Lab Python Workshops 156 | 157 | D-Lab offers a variety of Python workshops, geared toward different levels of 158 | expertise.
159 | 160 | ## Introductory Workshops 161 | 162 | - [Python Fundamentals](https://github.com/dlab-berkeley/Python-Fundamentals) 163 | - [Python Data Wrangling](https://github.com/dlab-berkeley/Python-Data-Wrangling) 164 | - [Python Data Visualization](https://github.com/dlab-berkeley/Python-Data-Visualization) 165 | 166 | ## Intermediate and Advanced Workshops 167 | 168 | - [Python Geospatial Fundamentals](https://github.com/dlab-berkeley/Geospatial-Data-and-Mapping-in-Python) 169 | - [Python Web Scraping and APIs](https://github.com/dlab-berkeley/Python-Web-Scraping) 170 | - [Python Machine Learning](https://github.com/dlab-berkeley/Python-Machine-Learning) 171 | - [Python Text Analysis](https://github.com/dlab-berkeley/Python-Text-Analysis) 172 | - [Python Deep Learning](https://github.com/dlab-berkeley/Python-Deep-Learning) 173 | 174 | # Contributors 175 | 176 | - [Pratik Sachdeva](https://github.com/pssachdeva) 177 | - [Ben Gebre-Medhin](http://gebre-medhin.com) 178 | - [Laura Nelson](http://www.lauraknelson.com) 179 | - [Teddy Roland](https://teddyroland.com/about/) 180 | - [Geoff Bacon](http://linguistics.berkeley.edu/~bacon/) 181 | - [Caroline Le Pennec-Caldichoury](https://dlab.berkeley.edu/people/caroline-le-pennec) 182 | 183 | These materials have evolved over a number of years. They were first developed 184 | by Laura Nelson and Teddy Roland, with contributions and revisions made by Ben 185 | Gebre-Medhin, Geoff Bacon, and Caroline Le Pennec-Caldichoury. They were heavily 186 | revamped by Pratik Sachdeva in 2022. 187 | 188 | -------------------------------------------------------------------------------- /data/example1.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | This is a text file that has some extra blankspace at the start and end. Blankspace is a catch-all term for spaces, tabs, newlines, and a bunch of other things that computers distinguish but to us all look like spaces, tabs and newlines. 4 | 5 | 6 | The Python method called "strip" only catches blankspace at the start and end of a string. But it won't catch it in the middle, for example, 7 | 8 | in this sentence. Once again, regular expressions will 9 | 10 | help us with this. 11 | 12 | 13 | -------------------------------------------------------------------------------- /data/example2.txt: -------------------------------------------------------------------------------- 1 | In this little example, we're going to see some of the problems that regularly appear in tokenization. Tokenization may seem simple, but it's harder than it first appears. Why is it so hard? Punctuations, contractions (like don't, won't and would've) get in the way. What do you do when you have #hashtags, @TwitterHandles, or https://urls.com? Different packages will make different decisions on when to split text apart, and when not to. 2 | -------------------------------------------------------------------------------- /data/example3.txt: -------------------------------------------------------------------------------- 1 | D-Lab helps Berkeley faculty, staff, and students move forward with world-class research in data intensive social science. We think of data as an expansive category, one that is constantly changing as the research frontier moves. We offer a venue for methodological exchange from all corners of campus and across its bounds. 
2 | 3 | D-Lab provides cross-disciplinary resources for in-depth consulting and advising access to staff support and training and provisioning for software and other infrastructure needs. Networking with other Berkeley centers and facilities and with our departments and schools, we offer our services to researchers across the disciplines and underwrite the breadth of excellence of Berkeley’s graduate programs and faculty research. D-Lab builds networks that Berkeley researchers can connect with users of social science data in the off-campus world. -------------------------------------------------------------------------------- /images/bow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Text-Analysis-Legacy-2023/111c54898a03c07207a7b831ee1705ee3449a468/images/bow.png -------------------------------------------------------------------------------- /images/lda.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Text-Analysis-Legacy-2023/111c54898a03c07207a7b831ee1705ee3449a468/images/lda.png -------------------------------------------------------------------------------- /images/nmf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Text-Analysis-Legacy-2023/111c54898a03c07207a7b831ee1705ee3449a468/images/nmf.png -------------------------------------------------------------------------------- /images/word2vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dlab-berkeley/Python-Text-Analysis-Legacy-2023/111c54898a03c07207a7b831ee1705ee3449a468/images/word2vec.png -------------------------------------------------------------------------------- /lessons/01_preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "d3e7ea21-6437-48e8-a9e4-3bdc05f709c9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Fundamentals, Part 1\n", 9 | "\n", 10 | "In this workshop series, we'll establish building blocks for performing text analysis in Python. These techniques lie in the domain of *natural language processing*, where we apply computational techniques to language written by humans in order to explain some of the underlying structure.\n", 11 | "\n", 12 | "So, the million dollar question: How exactly do we go about performing computational methods on words?\n", 13 | "\n", 14 | "This is ultimately a question of *representations*. Text naturally is represented as words, which are understandable to humans because we have a grammatical and syntactical structure we use to extract meaning from those words. However, most machine learning and data science techniques utilize numerical methods to extract patterns from large datasets. So, we need to find a way to convert the language into a numerical representation. We'll start with this goal in mind, and demonstrate how involved this process can be.\n", 15 | "\n", 16 | "We'll start this process by first importing text into Python. Then, we'll cover a variety of preprocessing steps you might want to use before proceeding with computational methods. 
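As a small preview of what a numerical representation can look like (the bag-of-words model is treated properly in Part 2 of the series), here is a minimal sketch that turns two invented sentences into a matrix of word counts using scikit-learn; the sentences are made up purely for illustration.

```python
# Preview: representing text numerically by counting words per document.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["The dog chased the ball.", "The cat ignored the dog."]
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(docs)

print(vectorizer.get_feature_names_out())  # the vocabulary learned from the sentences
print(counts.toarray())                    # one row of word counts per sentence
```

Each sentence has become a row of numbers, which is the kind of representation the rest of the series builds on.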
In the next sequence of this workshop, we'll work with the bag-of-words, or the first numerical representation of text that we'll encounter in this workshop series." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "8c8914f4-9783-4661-9cc9-32daca53e1fd", 22 | "metadata": {}, 23 | "source": [ 24 | "## Importing Files Containing Text\n", 25 | "\n", 26 | "Text data we want to analyze will be stored in external files that need to be imported. These files will generally be text files (`.txt`) or comma separated value files (`.csv`).\n", 27 | "\n", 28 | "All the data used in this notebook are stored in a `data` folder that we need to access. We need to adjust our filepaths accordingly:" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "366544be-4b56-4ed2-ba8e-aa4e41d5f95b", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "text_path = '../data/sowing_and_reaping.txt'" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "id": "b462052f-766c-434c-a262-efa7326d61c0", 44 | "metadata": {}, 45 | "source": [ 46 | "### Text Files\n", 47 | "\n", 48 | "We'll first start by importing \"Sowing and Reaping\" by Frances Harper, which is stored in a text file. Python has built-in functionality for importing text files:" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "f3dc4536-5e6c-4052-a44d-c36760714925", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "# Open and read the text\n", 59 | "with open(text_path, 'r') as file:\n", 60 | " raw_text = file.read()" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "60b8e3b1-f92e-4525-a5ba-f396fb61271a", 66 | "metadata": {}, 67 | "source": [ 68 | "We've stored the text file in an object called `raw_text`. We'll remove the front and end matter for better preprocessing later:" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "7ff4469c-6a66-44bb-bd82-f20455497141", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "# Remove the front and end matter\n", 79 | "sowing_and_reaping = raw_text[1114:684814]" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "id": "414ef3f4-1c0c-4c79-918f-72175a1292d3", 85 | "metadata": {}, 86 | "source": [ 87 | "---\n", 88 | "\n", 89 | "### Challenge 1: Working with Strings\n", 90 | "\n", 91 | "* What type of object is `sowing_and_reaping`?\n", 92 | "* How many characters are in `sowing_and_reaping`?\n", 93 | "* How can we get the first 1000 characters of `sowing_and_reaping`?\n", 94 | "\n", 95 | "---" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "id": "e693ebf0-6d66-4650-b825-e27f6860a8b0", 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# YOUR CODE HERE" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "d97e1de5-7cc6-4861-9ca9-0b58996d25f2", 111 | "metadata": {}, 112 | "source": [ 113 | "### Comma Separated Value (CSV) Files\n", 114 | "\n", 115 | "Often, we may have data stored in \"dataframes\" or \"tables\", which consists of many samples (rows), each containing several features (columns). Among the features is likely a text column which contains the text of interest. These dataframes are often found as Comma Separated Value (CSV) files (and somewhat less frequently as tab separated value (TSV) files). 
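The only practical difference between the two formats is the delimiter character. As a minimal sketch (using made-up inline data rather than the workshop files), pandas reads both the same way once the delimiter is specified:

```python
# The same tiny table, once comma-separated and once tab-separated.
import io
import pandas as pd

csv_text = "id,text\n1,great product\n2,not so great"
tsv_text = "id\ttext\n1\tgreat product\n2\tnot so great"

df_from_csv = pd.read_csv(io.StringIO(csv_text), sep=',')
df_from_tsv = pd.read_csv(io.StringIO(tsv_text), sep='\t')

print(df_from_csv.equals(df_from_tsv))  # True: same data, different delimiter
```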
In either case, there is some \"delimiter\" (i.e., a comma or tab) which helps separate entries from each other.\n", 116 | "\n", 117 | "The `pandas` package is the best package for dealing with dataframes in Python, and this package comes with its own function for reading CSV files. For example, let's read in a file containing many Tweets about airlines:" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "id": "c75449af-a3ad-48df-a64f-1325d2ef5c9c", 124 | "metadata": {}, 125 | "outputs": [], 126 | "source": [ 127 | "# Import pandas\n", 128 | "import pandas as pd\n", 129 | "# Use pandas to import Tweets\n", 130 | "csv_path = '../data/airline_tweets.csv'\n", 131 | "tweets = pd.read_csv(csv_path, sep=',')" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "id": "bb6e3107-9f0a-414b-8267-1d5fbb09e246", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "tweets.head()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "d201a3de-aaf6-4c21-97c8-5ac38517e0b5", 147 | "metadata": {}, 148 | "source": [ 149 | "Let's take a look at some of the Tweets:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": "7c32d507-ff14-4a2b-b7ae-613f3f76c69b", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "print(tweets['text'].iloc[0])\n", 160 | "print(tweets['text'].iloc[1])\n", 161 | "print(tweets['text'].iloc[2])" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "id": "3972dfc7-ddb4-486e-ac48-d5a7593f7b91", 167 | "metadata": {}, 168 | "source": [ 169 | "---\n", 170 | "\n", 171 | "### Challenge 2: Reading in Many Files\n", 172 | "\n", 173 | "The `data` folder contains another folder called `amazon`, which contains many `csv` files of Amazon reviews. Use a `for` loop to read in each dataframe. Do the following:\n", 174 | "\n", 175 | "* We've provided a path to the `amazon` folder, and a list of all the file names within the folder using the `os.listdir()` function.\n", 176 | "* Iterate over all these files, and import them using `pd.read_csv()`. You will need to use `os.path.join()` to create the correct path. Additionally, you need to provide `pandas` with the column names since they are not included in the reviews. We have create the `column_names` variable for you.\n", 177 | "* Extract the text column from each dataframe, and add then to the `reviews` list. 
\n", 178 | "* How many totals reviews do you obtain?\n", 179 | "\n", 180 | "---" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "5d9fa08c-7e28-4c3f-ab0c-1d1ab5ad4b4a", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "# The os package has useful tools for file manipulation\n", 191 | "import os\n", 192 | "# Amazon review folder\n", 193 | "amazon_path = '../data/amazon'\n", 194 | "# List all the files in the amazon folder\n", 195 | "files = os.listdir(amazon_path)\n", 196 | "# Column names for each file\n", 197 | "column_names = ['id',\n", 198 | " 'product_id',\n", 199 | " 'user_id',\n", 200 | " 'profile_name',\n", 201 | " 'helpfulness_num',\n", 202 | " 'helpfulness_denom',\n", 203 | " 'score',\n", 204 | " 'time',\n", 205 | " 'summary',\n", 206 | " 'text']\n", 207 | "# Add each review text to this list\n", 208 | "reviews = []" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "id": "fb680f3e-b82a-411c-aff0-79d2a6dfbc86", 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "for file in files:\n", 219 | " # Check that the file is actually a CSV file\n", 220 | " if os.path.splitext(file)[1] == '.csv':\n", 221 | " # YOUR CODE HERE\n", 222 | " text = ''\n", 223 | " reviews.extend(text)" 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "id": "cce98479-c234-41bc-a2da-c90ab822b421", 229 | "metadata": {}, 230 | "source": [ 231 | "There are other file types which you may come across: `json`, `xml`, `html`, etc. There are packages you can use to import each other these. The main challenge, in most cases, is dealing with multiple files, and extracting the actual text you want." 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "89e88f2b-bdaa-4c28-ad13-4e790ccb6827", 237 | "metadata": {}, 238 | "source": [ 239 | "## Preprocessing\n", 240 | "\n", 241 | "Our goal is to convert a text representation to a numerical representation. However, language can be messy. There's a variety of preprocessing steps that we'd like to do before we get to the numerical representation.\n", 242 | "\n", 243 | "We will largely be using a package called Natural Language Toolkit, or `nltk`, to perform these operations. In some cases, we'll use basic Python.\n", 244 | "\n", 245 | "There are a host of natural language processing packages one can use. For example, one newer package is `spaCy`, which is extremely powerful. Our goal here is to not make you an expert in a variety of NLP packages, but to expose you to principles that are shared by all of them. In this way, you'll be better prepared to open up any new NLP package you might have to use." 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "id": "c7fd3c0c-844b-4297-9edb-32bb441af9ec", 251 | "metadata": {}, 252 | "source": [ 253 | "### Installing `nltk`\n", 254 | "\n", 255 | "If this is your first time using `nltk`, we'll go through a couple steps to get set up. 
First, install `nltk` if you have not already:" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "id": "23185480-b884-4c26-95f0-e9e647280b77", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# Run if you do not have nltk installed\n", 266 | "%pip install nltk" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "id": "d3c7f258-c3b5-45a0-9de3-f0a9382bce8b", 272 | "metadata": {}, 273 | "source": [ 274 | "Next, we need to install a couple packages within `nltk`:" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "ff00fb9e-708b-43c8-9ec7-767228f34980", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "import nltk\n", 285 | "nltk.download('wordnet')\n", 286 | "nltk.download('stopwords')" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "3915028e-7ab1-4851-80f1-185da224cc52", 292 | "metadata": {}, 293 | "source": [ 294 | "### Text Cleaning\n", 295 | "\n", 296 | "\"Text cleaning\" is a catch-all term for the process of performing relatively simple tasks in order to normalize our code. Text cleaning can mean a variety of different things depending on your use case." 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "id": "aba11eee-c926-4591-bb32-d268646309ac", 302 | "metadata": { 303 | "tags": [] 304 | }, 305 | "source": [ 306 | "#### A Brief Introduction to Regular Expressions\n", 307 | "\n", 308 | "Before we dive into the specific text cleaning processes, let's briefly introduce regular expressions. We do this here since many text cleaning steps may require regular expressions, and many NLP libraries heavily use them under the hood.\n", 309 | "\n", 310 | "Regular expressions (regexes) are a powerful way of searching for specific string patterns in large corpora. They have an infamously steep learning curve, but are very efficient when you get a handle on them.\n", 311 | "\n", 312 | "Our goal in this workshop is not to provide a deep (or even shallow) dive into regexes; instead, we want to expose you to them so that you're better prepared to do deep dives in the future.\n", 313 | "\n", 314 | "Regex testers are a useful tool in both understanding and creating regex expression. An example is this [website](https://regex101.com)." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "id": "cf750b5f-c08b-4f35-9eb8-b2182f892b97", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "import re\n", 325 | "pattern = 'test'" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "dd9690d3-c12f-4a2c-bcc7-cfa3f3523949", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "test_string = 'This is a test.'\n", 336 | "# Find tokens\n", 337 | "tokens = re.findall(pattern=pattern, string=test_string)\n", 338 | "print(tokens)\n", 339 | "# Replace tokens\n", 340 | "replaced = re.sub(pattern=pattern, repl='not a test', string=test_string) \n", 341 | "print(replaced)" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "id": "d336ea3a-dd51-48a1-bf58-095298afdeb4", 347 | "metadata": {}, 348 | "source": [ 349 | "This is nice, but we could have done this somewhat easily with basic Python `string` functions. 
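For comparison, here is what that same literal find-and-replace looks like with plain string methods (assuming the same `test_string` as above); no regular expressions are needed for fixed, literal patterns.

```python
# The same literal search and replacement using built-in string methods.
test_string = 'This is a test.'

print(test_string.count('test'))                  # number of literal matches
print(test_string.replace('test', 'not a test'))  # replace every literal match
```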
Let's try something more interesting:" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "id": "244236dc-60b9-490c-98b7-33a6af5cb2b9", 356 | "metadata": {}, 357 | "outputs": [], 358 | "source": [ 359 | "# Word pattern matcher\n", 360 | "pattern = r'\\w+'\n", 361 | "re.findall(pattern, test_string)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "id": "3e60923f-b981-4ad9-8777-0951105736d8", 367 | "metadata": {}, 368 | "source": [ 369 | "What did this do? Use the regex website to confirm your guess!\n", 370 | "\n", 371 | "For now, we won't go much further than this, but there are many resources online to continue learning about regexes." 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "id": "91ebae34-7b23-49a9-8aad-8f49cb55f320", 377 | "metadata": {}, 378 | "source": [ 379 | "#### Lowercasing\n", 380 | "\n", 381 | "While there is often information in the \"casing\" of words (e.g., whether text is lowercase or uppercase), we often don't work in a regime where we're able to properly leverage this information. So, a common text cleaning step is to lowercase all text, in order to simplify our analysis.\n", 382 | "\n", 383 | "We can easily do this with the built-in string function `lower()`:" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "1a07353a-0dfc-4438-ade1-d2fee7a3305f", 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "sowing_and_reaping_lower = sowing_and_reaping.lower()" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "4e2cab72-6002-4861-af38-d0b67296eb62", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "print(sowing_and_reaping[:200])\n", 404 | "print('------')\n", 405 | "print(sowing_and_reaping_lower[:200])" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "id": "39654022-c93b-4bf5-9a63-efaf2b8125c3", 411 | "metadata": {}, 412 | "source": [ 413 | "#### Removing Punctuation\n", 414 | "\n", 415 | "Sometimes, you might want to keep only the alphanumeric characters (i.e. the letters and numbers) and ditch the punctuation. This becomes less common when we consider more advanced NLP algorithms. In many cases, you may do this step *after* tokenization, which we will discuss in the next section. " 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "32e11d9c-8440-479e-86a0-13c7e1e42d37", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "from string import punctuation\n", 426 | "print(punctuation)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "66e1da6f-0c1b-4588-a310-028c18fcad9b", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "punctuation_text = \"We've got quite a bit of punctuation here, don't we?!? #Python @D-Lab.\"\n", 437 | "no_punctuation = ''.join([char for char in punctuation_text if char not in punctuation])\n", 438 | "print(no_punctuation)" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "3a2f9bba-e2a5-4839-b017-f979a09752a1", 444 | "metadata": {}, 445 | "source": [ 446 | "#### Stripping Blank Spaces\n", 447 | "\n", 448 | "Removing blank space is a common step, as we might come across text with extraneous blank space. 
This is particularly common when text is imported from messy places, like webpages.\n", 449 | "\n", 450 | "Python has a built-in function to deal with blank space on the *ends* of strings:" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "id": "71d42f85-71fd-4fb3-9c29-c24c41188fea", 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "string = ' Hello! '\n", 461 | "string.strip()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "id": "17f3d2ef-3111-43f4-b77d-08d9363a0756", 467 | "metadata": {}, 468 | "source": [ 469 | "What about within text? We will need to use a regular expression for this:" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": null, 475 | "id": "cca33c78-f8bf-438f-9737-f0a89c30c470", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "example1_path = '../data/example1.txt'\n", 480 | "\n", 481 | "with open(example1_path, 'r') as file:\n", 482 | " example1 = file.read()\n", 483 | " \n", 484 | "print(example1)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "id": "53e94cb2-49c1-4be2-8496-6dd124d90188", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "# Stripping only removes the ends\n", 495 | "print(example1.strip())" 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": null, 501 | "id": "083b1553-dc87-4150-b857-76c2f0df8207", 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# A regular expression will handle blank spaces within the text\n", 506 | "blankspace_pattern = r'\\s+'\n", 507 | "blankspace_repl = ' '\n", 508 | "clean_text = re.sub(blankspace_pattern, blankspace_repl, example1)\n", 509 | "clean_text.strip()" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "id": "5e2a168e-e2e0-46c0-b794-fa0ecae400dc", 515 | "metadata": {}, 516 | "source": [ 517 | "#### Removing URLs, Hashtags, and Numbers\n", 518 | "\n", 519 | "Text containing non-alphabetic symbols may have additional meaning beyond simply using punctuation or numbers. For example, text may contain URLs, hashtags, or numbers. Each of these are informative in their own right.\n", 520 | "\n", 521 | "However, we rarely care about the exact URL used in a tweet. Similarly, we might not care about specific hashtags, or the precise number used. While, we could remove them completely, it's often informative to know that there *exists* a URL, hashtag, or number.\n", 522 | "\n", 523 | "So, we replace individual URLs, hashtags, and numbers with a \"symbol\" that preserves the fact these structures exist in the text. It's standard to just use the strings \"URL\", \"HASHTAG\", and \"DIGIT\".\n", 524 | "\n", 525 | "Since these types of text often contain precise structure, they're an apt case for using regular expressions. Let's apply these patterns to the Tweets above." 
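,
"\n",
"\n",
"One more structured pattern that shows up in tweets is the user mention: an `@` followed by a handle. As a quick aside, here is a minimal sketch of the same `re.sub` idea, using one reasonable mention pattern and a made-up example string (both are illustrative assumptions rather than part of the airline data):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "user-mention-aside",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch: replace user mentions with a 'USER' symbol\n",
"# (assumes a mention is '@' followed by word characters; the example string is made up)\n",
"mention_pattern = r'@(\\w+)'\n",
"mention_repl = ' USER '\n",
"re.sub(mention_pattern, mention_repl, '@dlab thanks for the quick reply!')"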
526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "id": "f8069337-4a39-4437-8710-f9b5c58335f2", 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "# Get a Tweet with a URL in it\n", 536 | "url_tweet = tweets['text'].iloc[13]\n", 537 | "print(url_tweet)" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "id": "fc1fa10d-e546-4e61-89e5-adf53c7e6262", 544 | "metadata": {}, 545 | "outputs": [], 546 | "source": [ 547 | "# URL \n", 548 | "url_pattern = r'(http|ftp|https):\\/\\/([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-])'\n", 549 | "url_repl = ' URL '\n", 550 | "re.sub(url_pattern, url_repl, url_tweet)" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": null, 556 | "id": "40416f9e-4052-425f-9a00-94a3b0a94e8c", 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "# Hashtag\n", 561 | "hashtag_pattern = r'(?:^|\\s)[##]{1}(\\w+)'\n", 562 | "hashtag_repl = ' HASHTAG '\n", 563 | "re.sub(hashtag_pattern, hashtag_repl, url_tweet)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "fba96052-a24e-46ca-8d99-60e47b485c0a", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "# Digits\n", 574 | "digit_tweet = tweets['text'].iloc[32]\n", 575 | "print(digit_tweet)\n", 576 | "digit_pattern = '\\d+'\n", 577 | "digit_repl = ' DIGIT '\n", 578 | "re.sub(digit_pattern, digit_repl, digit_tweet)" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "id": "8a288b9a-a33a-48be-bac4-93ffd39318ec", 584 | "metadata": {}, 585 | "source": [ 586 | "What other kinds of text strings can you think of that we might want to replace?\n", 587 | "\n", 588 | "Natural language is complex, and so there may be use cases where we might need specialized packages for preprocessing or removing text. For example, the [`emoji` package](https://pypi.org/project/emoji/) may be useful for social media text. The [`textacy` package](https://textacy.readthedocs.io/en/latest/) also provides useful preprocessing tools." 589 | ] 590 | }, 591 | { 592 | "cell_type": "markdown", 593 | "id": "92f5e64f-c6a0-447b-973e-a48e9eb80ade", 594 | "metadata": {}, 595 | "source": [ 596 | "---\n", 597 | "\n", 598 | "### Challenge 3: Text Cleaning with Multiple Steps\n", 599 | "\n", 600 | "In Challenge 1, we imported many Amazon reviews, and stored them in a variable called `reviews`. Each element of the list is a string, representing the text of a single review. For each review:\n", 601 | "\n", 602 | "* Replace any URLs and digits.\n", 603 | "* Make all characters lower case.\n", 604 | "* Strip all blankspace.\n", 605 | "\n", 606 | "Keep in mind: the order in which you do these steps matters!\n", 607 | "\n", 608 | "---" 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "id": "2d599284-2392-488c-a4a8-dad5eeb50f93", 615 | "metadata": {}, 616 | "outputs": [], 617 | "source": [ 618 | "def preprocess(text):\n", 619 | " # YOUR CODE HERE\n", 620 | " return text" 621 | ] 622 | }, 623 | { 624 | "cell_type": "markdown", 625 | "id": "bacc7c15-1bc7-47cd-b668-ab57634c8b04", 626 | "metadata": {}, 627 | "source": [ 628 | "## Tokenization\n", 629 | "\n", 630 | "One of the most important steps in text analysis is tokenization. 
This is the process of breaking down the text into \"tokens\", which are distinct chunks that we recognize as unique in whatever corpus we're working in.\n", 631 | "\n", 632 | "Let's start by importing an example file:" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "id": "3cd9d972-ae66-4415-8ce6-359cb9db0329", 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "example2_path = '../data/example2.txt'\n", 643 | "\n", 644 | "with open(example2_path) as file:\n", 645 | " example2 = file.read()\n", 646 | " \n", 647 | "print(example2)" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "id": "6b402ed3-de7f-4e16-ade6-444c9e74f8eb", 653 | "metadata": {}, 654 | "source": [ 655 | "Let's try naively tokenizing by splitting up the text according to blankspace, using a basic Python string method:" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "id": "99fabb62-ff4e-48f7-b057-f72983dcf87e", 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "tokens = example2.split()\n", 666 | "# Print first ten tokens\n", 667 | "tokens[:20]" 668 | ] 669 | }, 670 | { 671 | "cell_type": "markdown", 672 | "id": "60aa6b33-e927-4adc-8d6b-9a5a1f310e62", 673 | "metadata": {}, 674 | "source": [ 675 | "We can roughly think of this as \"word tokenization\". However, it's not always clear that simply splitting up by spaces will get what we want. Consider contractions, for example, which really consist of two words connected together. More advanced tokenizations will actually treat these words differently." 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "id": "6263bf1f-b033-49c6-81b7-794d0e5bf1cb", 681 | "metadata": {}, 682 | "source": [ 683 | "`nltk` has a function called `word_tokenize` which can tokenize a string for us in an intelligent fashion. Ultimately, `nltk` basically is a bunch of regular expressions under the hood:" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "06c63c1d-e1b3-4f21-aa57-460771e60801", 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "from nltk.tokenize import word_tokenize\n", 694 | "nltk_tokens = word_tokenize(example2)" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": null, 700 | "id": "9f8769c1-6446-465b-a9b3-06e26086a330", 701 | "metadata": {}, 702 | "outputs": [], 703 | "source": [ 704 | "print(nltk_tokens)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "id": "9022de3b-ab72-4616-bba2-831f58491757", 710 | "metadata": {}, 711 | "source": [ 712 | "Looking at this example, you can see how `nltk` has made certain decisions about where and when to tokenize. Tokenization is critical for downstream processing, and there's a variety of methods for performing the tokenizing. Let's take a look at `spaCy`'s tokenizer." 
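,
"\n",
"\n",
"Before we do, one quick aside: `nltk` also ships more specialized tokenizers. For tweet-like text, its `TweetTokenizer` keeps hashtags, mentions, and emoticons together as single tokens. Here is a minimal sketch on a made-up tweet (the example string is an illustrative assumption):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "tweet-tokenizer-aside",
"metadata": {},
"outputs": [],
"source": [
"# A minimal sketch: TweetTokenizer keeps hashtags, mentions, and emoticons intact\n",
"# (the example tweet below is made up)\n",
"from nltk.tokenize import TweetTokenizer\n",
"tweet_tokenizer = TweetTokenizer()\n",
"tweet_tokenizer.tokenize('@united thanks for the #awesome flight :)')"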
713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": null, 718 | "id": "12cdd905-96ef-42f4-852f-a39ceede2a9a", 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "# Install spaCy if necessary\n", 723 | "%pip install spacy" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "id": "59e4dbd6-46fa-477b-af86-63913670123e", 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "!python -m spacy download en_core_web_sm" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "id": "07f98810-7494-48bf-9d01-0c9df9bfc562", 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "# Import spaCy and load the dictionary\n", 744 | "import spacy\n", 745 | "nlp = spacy.load(\"en_core_web_sm\")\n", 746 | "# Pass the example into the English pipeline\n", 747 | "doc = nlp(example2)\n", 748 | "spacy_tokens = [token.text for token in doc]" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "id": "e8acb52a-e49b-4fdd-b09c-2e6c4fd5d7d1", 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "# Compare NLTK to spaCy\n", 759 | "print(nltk_tokens)\n", 760 | "print(spacy_tokens)" 761 | ] 762 | }, 763 | { 764 | "cell_type": "markdown", 765 | "id": "b8e2147b-841e-4a1f-97f1-fd64f23eb118", 766 | "metadata": {}, 767 | "source": [ 768 | "---\n", 769 | "\n", 770 | "### Challenge 4: Tokenizing a Large Text\n", 771 | "\n", 772 | "Tokenize \"Sowing and Reaping\", which we imported at the beginning of this workshop. Use a method of your choice.\n", 773 | "\n", 774 | "Once you've tokenized, find all the unique words types (you might want the `set` function). Then, sort the resulting `set` object to create a vocabulary (you might want to use the `sorted` function).\n", 775 | "\n", 776 | "---" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": null, 782 | "id": "b1678051-ecc8-4210-9547-4884598a048a", 783 | "metadata": {}, 784 | "outputs": [], 785 | "source": [ 786 | "# YOUR CODE HERE" 787 | ] 788 | }, 789 | { 790 | "cell_type": "markdown", 791 | "id": "61259cad-3b35-4ce9-89b3-77ba8283bc18", 792 | "metadata": {}, 793 | "source": [ 794 | "## Removing Stop Words\n", 795 | "\n", 796 | "Text often has words that are very common and usually not informative. These words tend to be pronouns or articles, such as \"the\", \"a\", \"it\", \"them\", etc. In many cases, these \"stop words\" are those that we may wish to remove before performing computation since they usually are not very informative. \n", 797 | "\n", 798 | "In practice, this is simple to do - we just filter out tokens by words. However, we may want to use different \"stop word lists\", depending on our use case. 
For example, `nltk` has a stop word list:" 799 | ] 800 | }, 801 | { 802 | "cell_type": "code", 803 | "execution_count": null, 804 | "id": "7989a296-4f9c-44eb-b0f1-c554194fc7b2", 805 | "metadata": {}, 806 | "outputs": [], 807 | "source": [ 808 | "from nltk.corpus import stopwords\n", 809 | "stop = stopwords.words('english')" 810 | ] 811 | }, 812 | { 813 | "cell_type": "code", 814 | "execution_count": null, 815 | "id": "6bf9ff5f-73c8-446c-be8e-dabe1bd6d8a2", 816 | "metadata": {}, 817 | "outputs": [], 818 | "source": [ 819 | "# What kinds of words are in the list?\n", 820 | "print(stop[:50])" 821 | ] 822 | }, 823 | { 824 | "cell_type": "code", 825 | "execution_count": null, 826 | "id": "c49a372c-ecef-412f-bdf3-34f76f8b3f78", 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [ 830 | "# Remove tokens that are stop words\n", 831 | "tokens = [token for token in tokens if token not in stop]\n", 832 | "print(tokens)" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "id": "95d66cdf-89a4-49b6-812d-7215f97272d8", 839 | "metadata": {}, 840 | "outputs": [], 841 | "source": [ 842 | "# Compare to the original text\n", 843 | "print(example2)" 844 | ] 845 | }, 846 | { 847 | "cell_type": "markdown", 848 | "id": "bd7692ad-8545-435f-b0cd-80f13ffff209", 849 | "metadata": {}, 850 | "source": [ 851 | "## Stemming and Lemmatization\n", 852 | "\n", 853 | "Stemming and lemmatization both refer to removing morphological affixes on words. Many words consist of a \"core\" word with a modified ending that adjusts the word's meaning in a given context. For example, the word \"grows\" is simply \"grow\" with an \"s\" added to denote a change in verb tense. In many cases, we're interested in the core content of the word. Stemming and lemmatization are the process of getting at the \"core\" of a word. This \"core\" component is often referred to as the *lemma*.\n", 854 | "\n", 855 | "Stemming is a rudimentary approach to obtaining the lemma: it simply removes an ending of a word. So, \"grows\" would be stemmed to \"grow\". The word \"running\" would be stemmed to \"run\".\n", 856 | "\n", 857 | "Lemmatization is more general: it aims to find the lemma of a word, but can handle cases where stemming may not work. For example, the word \"fairies\" cannot be stemmed to the lemma, \"fairy\". So, we need additional rules - provided by lemmatization - that can appropriately turn \"fairies\" into \"fairy\".\n", 858 | "\n", 859 | "`nltk` provides many algorithms for stemming. We'll use the Snowball Stemmer, which we'll import from `nltk`. 
We'll also look at the Word Net Lemmatizer:" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "id": "174b304d-5d5b-4303-9262-c43f6c6948e7", 866 | "metadata": {}, 867 | "outputs": [], 868 | "source": [ 869 | "from nltk.stem import SnowballStemmer, WordNetLemmatizer" 870 | ] 871 | }, 872 | { 873 | "cell_type": "code", 874 | "execution_count": null, 875 | "id": "6e9e5f2e-2384-4631-b260-8fbe1e6970cf", 876 | "metadata": {}, 877 | "outputs": [], 878 | "source": [ 879 | "# Instantiate the stemmer and lemmatizer\n", 880 | "stemmer = SnowballStemmer('english')\n", 881 | "lemmatizer = WordNetLemmatizer()" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": null, 887 | "id": "dac5719f-83d2-49ef-b1a6-3c9cfe0a7ec2", 888 | "metadata": {}, 889 | "outputs": [], 890 | "source": [ 891 | "# Stemming examples\n", 892 | "print(stemmer.stem('grows'))\n", 893 | "print(stemmer.stem('running'))\n", 894 | "print(stemmer.stem('coded'))" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": null, 900 | "id": "35604e03-b724-4896-85be-7a664d18c27a", 901 | "metadata": {}, 902 | "outputs": [], 903 | "source": [ 904 | "# When does stemming not quite work?\n", 905 | "print(stemmer.stem('fairies'))\n", 906 | "print(stemmer.stem('wolves'))\n", 907 | "print(stemmer.stem('abaci'))\n", 908 | "print(stemmer.stem('leaves'))\n", 909 | "print(stemmer.stem('carried'))" 910 | ] 911 | }, 912 | { 913 | "cell_type": "code", 914 | "execution_count": null, 915 | "id": "7d46f1f8-e6aa-48db-a954-508fdf4f542a", 916 | "metadata": {}, 917 | "outputs": [], 918 | "source": [ 919 | "# Let's try lemmatizing these, instead:\n", 920 | "print(lemmatizer.lemmatize('fairies'))\n", 921 | "print(lemmatizer.lemmatize('wolves'))\n", 922 | "print(lemmatizer.lemmatize('abaci'))\n", 923 | "print(lemmatizer.lemmatize('leaves'))\n", 924 | "print(lemmatizer.lemmatize('carried'))" 925 | ] 926 | }, 927 | { 928 | "cell_type": "markdown", 929 | "id": "044da5ed-cf6a-4114-b4b7-102e8504ce42", 930 | "metadata": {}, 931 | "source": [ 932 | "What happened with that last one? Sometimes we need to provide the lemmatizer a 'part-of-speech' tag to help resolve ambiguous cases. This is another argument in the lemmatizer:" 933 | ] 934 | }, 935 | { 936 | "cell_type": "code", 937 | "execution_count": null, 938 | "id": "50976d9b-5151-4dc1-862d-6cb2576c8dfb", 939 | "metadata": {}, 940 | "outputs": [], 941 | "source": [ 942 | "print(lemmatizer.lemmatize('carried', pos='v'))" 943 | ] 944 | }, 945 | { 946 | "cell_type": "markdown", 947 | "id": "27d6c43f-33e9-4f48-8812-f77e31a85111", 948 | "metadata": {}, 949 | "source": [ 950 | "Try it with \"leaves\", which has more than one way to lemmatize!" 
951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": null, 956 | "id": "961a0ed8-d653-41c6-932d-90dba75329d0", 957 | "metadata": {}, 958 | "outputs": [], 959 | "source": [ 960 | "print(lemmatizer.lemmatize('leaves', pos='n'))\n", 961 | "print(lemmatizer.lemmatize('leaves', pos='v'))" 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "id": "6e2a4cf0-cfd9-4116-94ef-58ef4426b482", 967 | "metadata": {}, 968 | "source": [ 969 | "---\n", 970 | "\n", 971 | "### Challenge 5: Apply a Lemmatizer to Text\n", 972 | "\n", 973 | "Lemmatize the tokenized `example2` text using the `nltk`'s `WordNetLemmatizer`.\n", 974 | "\n", 975 | "---" 976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "id": "dc687676-4298-4a88-9655-2e77b72b4192", 982 | "metadata": {}, 983 | "outputs": [], 984 | "source": [ 985 | "# YOUR CODE HERE" 986 | ] 987 | }, 988 | { 989 | "cell_type": "markdown", 990 | "id": "a429b117-1983-45bf-bdff-d6f0f83a8c22", 991 | "metadata": {}, 992 | "source": [ 993 | "---\n", 994 | "\n", 995 | "### Challenge 6: Putting it All Together\n", 996 | "\n", 997 | "Write a function called `preprocess()` that accepts a string and performs the following preprocessing steps:\n", 998 | "\n", 999 | "* Lowercase text.\n", 1000 | "* Replace all URLs and numbers with their respective tokens.\n", 1001 | "* Strip blankspace.\n", 1002 | "* Tokenize.\n", 1003 | "* Remove punctuation.\n", 1004 | "* Remove stop words.\n", 1005 | "* Lemmatize the tokens.\n", 1006 | "\n", 1007 | "Apply this function to `sowing_and_reaping`.\n", 1008 | "\n", 1009 | "---" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "id": "4f5dbfb0-97b1-4abf-845c-17a0d4942831", 1016 | "metadata": {}, 1017 | "outputs": [], 1018 | "source": [ 1019 | "def preprocess(text):\n", 1020 | " # YOUR CODE HERE\n", 1021 | " return text" 1022 | ] 1023 | }, 1024 | { 1025 | "cell_type": "code", 1026 | "execution_count": null, 1027 | "id": "8d259147-ad68-417c-99a6-1fecfb9b9a5f", 1028 | "metadata": {}, 1029 | "outputs": [], 1030 | "source": [ 1031 | "preprocess(sowing_and_reaping)" 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "markdown", 1036 | "id": "ca49653f-8700-4681-a50a-43bbb14c124d", 1037 | "metadata": {}, 1038 | "source": [ 1039 | "## Powerful Features of `spaCy`\n", 1040 | "\n", 1041 | "We will end this portion of the workshop by examining some of the more powerful features offered by the newer NLP library, `spaCy`. Beside being quite fast, `spaCy` provides very powerful built-in tools in its tokenizer. For example, we automatically get many of the above operations in one fell swoop:" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": null, 1047 | "id": "f741be70-2e03-4b95-b1a0-d5a525da9ba1", 1048 | "metadata": {}, 1049 | "outputs": [], 1050 | "source": [ 1051 | "short_example = \"We're learning about natural language processing at Berkeley.\"\n", 1052 | "doc = nlp(short_example)\n", 1053 | "\n", 1054 | "for token in doc:\n", 1055 | " print(\n", 1056 | " f\"Token: {token.text}; Lemma: {token.lemma_}; Part-of-speech: {token.pos_}; \"\n", 1057 | " f\"Token shape: {token.shape_}; Alphabetical? {token.is_alpha}; Stop Word? 
{token.is_stop}\"\n", 1058 | " )" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "id": "bcec4cf7-eb3c-4b4a-9008-bb2c113634c8", 1064 | "metadata": {}, 1065 | "source": [ 1066 | "Tokenizing, lemmatization, part of speech tagging, stop word detection, and a couple other things are provided to us up front when we pass a text into the `nlp` module." 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "id": "d5c70884-da79-45e8-a3b1-905f57241817", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "`spaCy` also comes with some pretty shiny visualization tools:" 1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "id": "ab7351a4-b05f-4938-894f-9a21309b3473", 1081 | "metadata": {}, 1082 | "outputs": [], 1083 | "source": [ 1084 | "from spacy import displacy\n", 1085 | "displacy.render(doc, style=\"dep\", options={'compact': True})" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "markdown", 1090 | "id": "28f9e25d-7f90-4656-804e-351af0cf0356", 1091 | "metadata": {}, 1092 | "source": [ 1093 | "For longer texts, we also get the ability to perform a variety of other operations very easily. Here are some cases:" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": null, 1099 | "id": "65eee367-1ade-4b95-b5bb-21968a9050b5", 1100 | "metadata": {}, 1101 | "outputs": [], 1102 | "source": [ 1103 | "example3_path = '../data/example3.txt'\n", 1104 | "\n", 1105 | "with open(example3_path, 'r') as file:\n", 1106 | " example3 = file.read()\n", 1107 | " \n", 1108 | "doc = nlp(example3)" 1109 | ] 1110 | }, 1111 | { 1112 | "cell_type": "code", 1113 | "execution_count": null, 1114 | "id": "64d59d76-d30c-44fb-b862-c208c3212e81", 1115 | "metadata": {}, 1116 | "outputs": [], 1117 | "source": [ 1118 | "print(example3)" 1119 | ] 1120 | }, 1121 | { 1122 | "cell_type": "code", 1123 | "execution_count": null, 1124 | "id": "45ce243b-60b8-4e28-a69b-858e9df3bd55", 1125 | "metadata": {}, 1126 | "outputs": [], 1127 | "source": [ 1128 | "# Sentence segmentation\n", 1129 | "print('Sentence Segmentation')\n", 1130 | "for sentence in doc.sents:\n", 1131 | " print(sentence)\n", 1132 | "\n", 1133 | "# Entity detection\n", 1134 | "print('\\nEntity Detection:')\n", 1135 | "for entity in doc.ents:\n", 1136 | " print(entity.text, entity.label_)\n", 1137 | "\n", 1138 | "# Noun chunks\n", 1139 | "print('\\nNoun Chunks:')\n", 1140 | "for chunk in doc.noun_chunks:\n", 1141 | " print(chunk)" 1142 | ] 1143 | }, 1144 | { 1145 | "cell_type": "markdown", 1146 | "id": "b2f379d0-edf6-4282-9075-cc168b7444a3", 1147 | "metadata": {}, 1148 | "source": [ 1149 | "There's a whole lot else we can do with it! Check out `spaCy`'s documentation to see more." 
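,
"\n",
"\n",
"As one last quick sketch, `displacy` can also highlight the entities it detected in this text (the `ent` style is another of its built-in visualizers):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "displacy-ent-aside",
"metadata": {},
"outputs": [],
"source": [
"# displacy can also render the detected entities inline ('ent' style)\n",
"displacy.render(doc, style='ent')"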
1150 | ] 1151 | } 1152 | ], 1153 | "metadata": { 1154 | "kernelspec": { 1155 | "display_name": "Python 3 (ipykernel)", 1156 | "language": "python", 1157 | "name": "python3" 1158 | }, 1159 | "language_info": { 1160 | "codemirror_mode": { 1161 | "name": "ipython", 1162 | "version": 3 1163 | }, 1164 | "file_extension": ".py", 1165 | "mimetype": "text/x-python", 1166 | "name": "python", 1167 | "nbconvert_exporter": "python", 1168 | "pygments_lexer": "ipython3", 1169 | "version": "3.8.12" 1170 | } 1171 | }, 1172 | "nbformat": 4, 1173 | "nbformat_minor": 5 1174 | } 1175 | -------------------------------------------------------------------------------- /lessons/03_word_embeddings.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "2c0c2f15-1a64-4b2b-8ca1-270f05aa24b9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Word Embeddings" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "edf6a718-71e0-49d9-a1ed-7b0ab99d4e81", 14 | "metadata": {}, 15 | "source": [ 16 | "Thus far, we've focused on bag-of-word approaches to text analysis, where the text is represented as a vector of word frequencies. This generally works pretty well - we can do a decent job of supervised classification with this approach. However, word frequencies alone don't tell the whole picture. The ordering of words, for example, provides additional context that word frequencies don't capture. Furthermore, words can be used in a variety of ways, with different meanings that get lost in a word frequency representation.\n", 17 | "\n", 18 | "An alternative formalization of text consists of representing the words (or bi-grams, phrases, etc.) as vectors. They're also called word embeddings, because we embed the word in a higher dimensional space. A word vector has no inherent meaning to humans - ultimately, it's just a bunch of floating point numbers. But word vectors are useful because they're a numerical representation of text that captures its semantic meaning, and can easily be used in downstream tasks, such as dictionary methods, classification, topic modeling etc. Furthermore, the vector representation can be used to perform semantic tasks, such as finding synonyms, testing analogies, and others. The big question, however, is: how do we create the word vector in the first place?\n", 19 | "\n", 20 | "The answer is to pick the right task. Specifically, we're going to calculate the word vectors so that they can be successfully used in one of two tasks: predicting surrounding words, or predicting words within a context." 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "id": "94fc6996", 26 | "metadata": {}, 27 | "source": [ 28 | "# The Word Embedding Model: `word2vec`" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "ef6396f1-6baa-49cb-8a4a-3679732f315e", 34 | "metadata": {}, 35 | "source": [ 36 | "The word embedding model, generally referred to as `word2vec`, was developed by [Mikolov et al.](https://arxiv.org/abs/1310.4546) in 2013. The basic premise is to find vector representations of tokens that have semantic meaning. How do we go about learning a \"good\" vector representation from the data?\n", 37 | "\n", 38 | "Mikolov et al. proposed two approaches: the **skip-gram (SG)** and the **continuous bag-of-words (CBOW)**. 
Both approaches are similar in that we use the vector representation of a token to try and predict what the nearby tokens are with a shallow neural network.\n",
39 | "\n",
40 | "![word2vec](../images/word2vec.png)\n",
41 | "\n",
42 | "In the continuous bag-of-words model, our goal is to predict a word $w(t)$, given the words that surround it - e.g., $w(t-2), w(t-1), w(t+1), w(t+2)$, etc. So, in an example text such as `I went to the store to get some apples`, we may try to use the word vectors for `I`, `went`, `to`, `the`, `to`, `get`, `some`, `apples` to predict the word `store`. This would correspond to a *window size* of 4: 4 words on either side of the target word.\n",
43 | "\n",
44 | "In the skip-gram model, we construct a word vector that can be used to predict the words surrounding a specific word $w(t)$. This is the reverse of the continuous bag-of-words, and is a harder task, since we have to predict more from less information. In the above example, we'd aim to predict the remaining words in the sentence from the word vector for `store`. \n",
45 | "\n",
46 | "You can use either approach to build a set of word embeddings. Mikolov et al. demonstrated that the skip-gram works pretty well on larger corpora. Furthermore, the skip-gram can be trained efficiently (for example, with negative sampling).\n",
47 | "\n",
48 | "The mechanics of the training revolve around a **shallow neural network**. An **objective function** is specified - a mathematical expression that quantifies how well we predicted a word - which allows the values of the word vectors to be optimized using **back propagation**. We won't go into these details for this workshop, but check out the Python Deep Learning workshop if you'd like to learn more about neural networks!\n",
49 | "\n",
50 | "Let's jump into it!"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "id": "3aaa0a94",
56 | "metadata": {},
57 | "source": [
58 | "# Installing `gensim`"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "id": "c3b7b49d-86ab-45d3-9e63-42926c134b30",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "import numpy as np\n",
69 | "import pandas as pd\n",
70 | "import re\n",
71 | "\n",
72 | "from sklearn.model_selection import train_test_split\n",
73 | "from sklearn.linear_model import LogisticRegressionCV"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "id": "6ff20430",
79 | "metadata": {},
80 | "source": [
81 | "We'll be using a package called `gensim` to conduct our word embedding experiments. 
`gensim` is one of the major Python packages for natural language processing, largely aimed at using different kinds of embeddings.\n", 82 | "\n", 83 | "If you don't have `gensim` installed, you can install it directly within this notebook:" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "94d66811", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# Run if you do not have gensim installed\n", 94 | "!pip install gensim" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "id": "4b84a258", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "import gensim\n", 105 | "import gensim.downloader as api" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "67f162cd-e284-4081-bf8f-e052e492e9e8", 111 | "metadata": {}, 112 | "source": [ 113 | "# Using Pre-trained Word Embeddings" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "id": "e01cd2bc", 119 | "metadata": {}, 120 | "source": [ 121 | "The first thing we'll do is use a pre-trained word embedding. This means that we're downloading a word embedding model that has already been trained on a large corpus. Researchers have trained a variety of models in different contexts that are freely available on `gensim`. We can take a look at a few of them by looking in the `gensim` downloader:" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "9042101c", 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "gensim_models = list(api.info()['models'].keys())\n", 132 | "print(gensim_models)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "id": "5b25caa4", 138 | "metadata": {}, 139 | "source": [ 140 | "We are going to use the `word2vec-google-news-300` model: this is a word embedding model that is trained on Google News, where the embedding is 300 dimensions. Downloading this might take a while! The word embedding model is nearly 2 GB. " 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "28406e56-9728-40df-a370-aafb9cce12d9", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "wv = api.load('word2vec-google-news-300')" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "4455f5da", 156 | "metadata": {}, 157 | "source": [ 158 | "How many word vectors are available in this word embedding model? We can access the `index_to_key` member variable to find out:" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "id": "41831a2b", 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "n_words = len(wv.index_to_key)\n", 169 | "print(f\"Number of words: {n_words}\")\n", 170 | "print(wv.index_to_key[:20])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "id": "0ac1e2ab", 176 | "metadata": {}, 177 | "source": [ 178 | "The model is trained using a vocabulary of size 3 million! This is a huge model, which takes hours to train. This is why we used a pre-trained model - we likely don't have the resources to train this on our local machines.\n", 179 | "\n", 180 | "Accessing the actual word vectors can be done by treating the word vector model as a dictionary. 
For example, let's take a look at the word vector for `\"banana\"`:"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": null,
186 | "id": "c160fd83",
187 | "metadata": {},
188 | "outputs": [],
189 | "source": [
190 | "print(wv[\"banana\"])\n",
191 | "print(wv[\"banana\"].size)"
192 | ]
193 | },
194 | {
195 | "cell_type": "markdown",
196 | "id": "f75bb9eb",
197 | "metadata": {},
198 | "source": [
199 | "As promised, the word vector is 300-dimensional. Looking at the actual values of the vector is pretty uninformative - the values appear to be random floats. However, now that the word has been transformed into a vector, we can more easily perform computations on it that correspond to semantic operations. Let's take a look at a few examples."
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "id": "47060f92",
205 | "metadata": {},
206 | "source": [
207 | "## Word Similarity"
208 | ]
209 | },
210 | {
211 | "cell_type": "markdown",
212 | "id": "7673852e",
213 | "metadata": {},
214 | "source": [
215 | "A semantic question we can ask is which words are similar to \"banana\". How does word similarity look in vector operations? We'd expect similar words to have vectors that are closer to each other in vector space.\n",
216 | "\n",
217 | "There are many metrics of vector similarity - one of the most useful ones is the [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). It ranges from -1 to 1, with orthogonal vectors having a cosine similarity of 0, and parallel vectors having a cosine similarity of 1. `gensim` provides a function that lets us find the most similar vectors to a queried vector - let's give it a shot!"
218 | ]
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "35ed3da8",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": [
227 | "wv.most_similar('banana')"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "id": "869df615",
233 | "metadata": {},
234 | "source": [
235 | "The most similar vectors to \"banana\" are other fruits and foods! These are conceptual relationships that are reflected in the word embedding, even though we never explicitly trained the model to capture them. Let's try another, more abstract word:"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "id": "9cd0e108-ebd2-4864-b436-7ff9f717cbf6",
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "wv.most_similar('happy')"
246 | ]
247 | },
248 | {
249 | "cell_type": "markdown",
250 | "id": "da4a9a34",
251 | "metadata": {},
252 | "source": [
253 | "We see synonyms of \"happy\", and even an antonym (\"disappointed\")."
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "id": "b49ce27b",
259 | "metadata": {},
260 | "source": [
261 | "## Challenge 1\n",
262 | "\n",
263 | "Look up the `doesnt_match` function in `gensim`'s documentation. Use this function to identify which word doesn't match in the following group:\n",
264 | "\n",
265 | "banana, apple, strawberry, happy\n",
266 | "\n",
267 | "Then, try it on groups of words that you choose. Here are some suggestions:\n",
268 | "\n",
269 | "1. A group of fruits, and a vegetable. Can it identify that the vegetable doesn't match?\n",
270 | "2. A group of vehicles that travel by land, and a vehicle that travels by air (e.g., a plane or helicopter). Can it identify the vehicle that flies?\n",
271 | "3. A group of scientists (e.g., biologist, physicist, chemist, etc.) 
and a person who does not study an empirical science (e.g., an artist). Can it identify the occupation that is not science based?\n", 272 | "\n", 273 | "To be clear, `word2vec` does not learn the precise nature of the differences between these groups. However, the semantic differences correspond to similar words appearing near each other in large corpora." 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "id": "779b2d08-83b7-49ed-a917-97e10e34221e", 279 | "metadata": {}, 280 | "source": [ 281 | "## Word Analogies" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "id": "b3a5bb85", 287 | "metadata": {}, 288 | "source": [ 289 | "One of the most famous usages of `word2vec` is via word analogies. For example:\n", 290 | "\n", 291 | "`Paris : France :: Berlin : Germany`\n", 292 | "\n", 293 | "Here, the analogy is between (Paris, France) and (Berlin, Germany), with \"capital city\" being the concept that connects them. We can abstract the \"analogy\" relationship to vector modeling. Let's pretend we're working with each of the vectors. Then, the analogy is\n", 294 | "\n", 295 | "$\\mathbf{v}_{\\text{France}} - \\mathbf{v}_{\\text{Paris}} \\approx \\mathbf{v}_{\\text{Germany}} - \\mathbf{v}_{\\text{Berlin}}.$\n", 296 | "\n", 297 | "The vector difference here represents the notion of \"capital city\". Presumably, going from the Paris vector to the France vector (i.e., the vector difference) will be the same as going from the Berlin vector to the Germany vector, if that difference carries similar semantic meaning.\n", 298 | "\n", 299 | "Let's test this directly. We'll do so by rewriting the above expression:\n", 300 | "\n", 301 | "$\\mathbf{v}_{\\text{France}} - \\mathbf{v}_{\\text{Paris}} + \\mathbf{v}_{\\text{Berlin}} \\approx \\mathbf{v}_{\\text{Germany}}.$\n", 302 | "\n", 303 | "We'll calculate the difference between Paris and France, add on Germany, and find the closest vector to that quantity. Notice that, in all these operations, we set `norm=True`, and renormalize. That's because different vectors might be of different lengths, so the normalization puts everything on a common scale." 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "id": "5556ad23", 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "# Calculate \"capital city\" vector difference\n", 314 | "difference = wv.get_vector('France', norm=True) - wv.get_vector('Paris', norm=True) \n", 315 | "# Add on Berlin\n", 316 | "difference += wv.get_vector('Berlin', norm=True)\n", 317 | "# Renormalize vector\n", 318 | "difference /= np.linalg.norm(difference)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "a43aa6cd", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "# What is the most similar vector?\n", 329 | "wv.most_similar(difference)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "id": "5e9ecfe0", 335 | "metadata": {}, 336 | "source": [ 337 | "Germany is the most similar! So, word analogies seem possible with `word2vec`.\n", 338 | "\n", 339 | "Carrying out these operations can be done in one fell swoop with the `most_similar` function. Check the documentation for this function. What do the `positive` and `negative` arguments mean?" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "id": "d2b1de6f", 345 | "metadata": {}, 346 | "source": [ 347 | "## Challenge 2\n", 348 | "\n", 349 | "Carry out the following word analogies:\n", 350 | "\n", 351 | "1. 
Mouse : Mice :: Goose : ?\n", 352 | "2. Kangaroo : Joey :: Cat : ?\n", 353 | "3. United States : Dollar :: Mexico : ?\n", 354 | "4. Happy : Sad :: Up : ?\n", 355 | "5. California : Sacramento :: Canada : ?\n", 356 | "6. California : Sacramento :: Washington : ?\n", 357 | "\n", 358 | "What about something more abstract, such as:\n", 359 | "\n", 360 | "7. United States : hamburger :: Canada : ?\n", 361 | "\n", 362 | "Some work well, and others don't work as well. Try to come up with your own analogies!" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "id": "3a3bef04-6c7a-4332-9a5d-ce4be72bd52c", 368 | "metadata": {}, 369 | "source": [ 370 | "# Creating Custom Word Embeddings" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "id": "5397cb46-3c81-49e9-b7bb-37efe55803d4", 376 | "metadata": {}, 377 | "source": [ 378 | "In the previous example, we used a *pretrained* word embedding. That is, the word embedding was already trained using a very large corpus from Google News. What about when we want to train our own word embeddings from a new corpus?\n", 379 | "\n", 380 | "We can do that using `gensim` as well. However, if the corpus is large, training becomes very computationally taxing. So, we'll try training our own word embeddings, but on a much smaller corpus. Specifically, we'll return to one you should recognize: the airline tweets corpus!\n", 381 | "\n", 382 | "Let's go ahead and get set up by importing the dataset and preprocessing, as we did in Part 2." 383 | ] 384 | }, 385 | { 386 | "cell_type": "code", 387 | "execution_count": null, 388 | "id": "9dda1c4d-5006-422d-a3c6-857d59a08f02", 389 | "metadata": {}, 390 | "outputs": [], 391 | "source": [ 392 | "tweets_path = '../data/airline_tweets.csv'\n", 393 | "tweets = pd.read_csv(tweets_path, sep=',')" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "d8a22b90-c66d-4ab0-a164-bd30c67241af", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "def preprocess(text):\n", 404 | " \"\"\"Preprocesses a string.\"\"\"\n", 405 | " # Lowercase\n", 406 | " text = text.lower()\n", 407 | " # Replace URLs\n", 408 | " url_pattern = r'https?:\\/\\/.*[\\r\\n]*'\n", 409 | " url_repl = ' URL '\n", 410 | " text = re.sub(url_pattern, url_repl, text)\n", 411 | " # Replace digits\n", 412 | " digit_pattern = '\\d+'\n", 413 | " digit_repl = ' DIGIT '\n", 414 | " text = re.sub(digit_pattern, digit_repl, text)\n", 415 | " # Replace hashtags\n", 416 | " hashtag_pattern = r'(?:^|\\s)[##]{1}(\\w+)'\n", 417 | " hashtag_repl = ' HASHTAG '\n", 418 | " text = re.sub(hashtag_pattern, hashtag_repl, text)\n", 419 | " # Replace users\n", 420 | " user_pattern = r'@(\\w+)'\n", 421 | " user_repl = ' USER '\n", 422 | " text = re.sub(user_pattern, user_repl, text)\n", 423 | " # Remove blank spaces\n", 424 | " blankspace_pattern = r'\\s+'\n", 425 | " blankspace_repl = ' '\n", 426 | " text = re.sub(blankspace_pattern, blankspace_repl, text).strip()\n", 427 | " return text" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "id": "bfb5e805-3750-449c-ac6e-6fee95c0896c", 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "tweets['text_processed'] = tweets['text'].apply(lambda x: preprocess(x))\n", 438 | "tweets['text_processed'].head()" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "id": "bbb98145-a9ae-47ec-9928-99d850e57586", 444 | "metadata": {}, 445 | "source": [ 446 | "To create our own model, we need to import the `Word2Vec` module 
from `gensim`:" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "id": "126c3ba3-c322-43f2-ae73-f0244a68b65f", 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "from gensim.models import Word2Vec" 457 | ] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "id": "07f07191-ce87-46a2-bd09-2729312f25a4", 462 | "metadata": {}, 463 | "source": [ 464 | "You can check out the documentation for this module [here](https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec). The main input to `Word2Vec` is a `sentences` argument, which consists of a list of lists: the outer list enumerates the documents, and the inner list enumerates the tokens within in each list. So, we need to run a word tokenizer on each of the tweets. Let's use `nltk`'s word tokenizer:" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "id": "4c5a6831-5b42-4ed3-99b8-3241b84a4785", 471 | "metadata": {}, 472 | "outputs": [], 473 | "source": [ 474 | "from nltk.tokenize import word_tokenize" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": null, 480 | "id": "efe2ca18-d18c-4248-924e-92fe5c9f97cc", 481 | "metadata": {}, 482 | "outputs": [], 483 | "source": [ 484 | "sentences = [word_tokenize(tweet) for tweet in tweets['text_processed']]" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": null, 490 | "id": "5a6e0abf-8a99-49f1-aaf5-ea21cb56cf50", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "sentences[0]" 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "id": "b273230a-5c31-4156-a183-53fdd7aecc78", 500 | "metadata": {}, 501 | "source": [ 502 | "Now, we train the model. We are going to use CBOW to train the model since it's better suited for smaller datasets. Take note of what other arguments we set:" 503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": null, 508 | "id": "ceaf88ad-677f-47de-bde5-cca609c7a243", 509 | "metadata": {}, 510 | "outputs": [], 511 | "source": [ 512 | "model = Word2Vec(\n", 513 | " sentences=sentences,\n", 514 | " vector_size=30,\n", 515 | " window=5,\n", 516 | " min_count=1,\n", 517 | " sg=0)" 518 | ] 519 | }, 520 | { 521 | "cell_type": "markdown", 522 | "id": "7997232a-2337-4709-87c0-f8e40971d1a1", 523 | "metadata": {}, 524 | "source": [ 525 | "The model is now trained! Let's take a look at some word vectors. 
We can access them using the `wv` attribute:" 526 | ] 527 | }, 528 | { 529 | "cell_type": "code", 530 | "execution_count": null, 531 | "id": "91bbdcaa-e04c-42fa-8b28-9b892f7abe89", 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [ 535 | "len(model.wv)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "id": "4f1b2108-6022-4e78-ac4b-56bf80665d15", 542 | "metadata": {}, 543 | "outputs": [], 544 | "source": [ 545 | "model.wv['worst']" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "id": "bc5b9a5f-3d8c-4e61-a24f-23ed9bda530d", 551 | "metadata": {}, 552 | "source": [ 553 | "Let's try running a `most_similar` query to see what we end up with:" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "id": "4578c24a-b913-4254-8a6c-ac4de77c878c", 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "model.wv.most_similar('worst')" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "id": "fde0ac2f-bac5-40bf-8f3b-81031183c9da", 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "model.wv.most_similar('great')" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": null, 579 | "id": "a16554ab-06c8-4fc4-8d2f-1e134263bf8e", 580 | "metadata": {}, 581 | "outputs": [], 582 | "source": [ 583 | "model.wv.most_similar('united')" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "id": "d8e92543-087b-4ec6-b490-fb5f14dcec77", 589 | "metadata": {}, 590 | "source": [ 591 | "The `word2vec` model learned these relationships from the roughly 11,000 tweets in the corpus. These relationships look similar to some in the Google News word embeddings, have some differences that stem from the particular nature of the corpus, and the smaller number of documents." 592 | ] 593 | }, 594 | { 595 | "cell_type": "markdown", 596 | "id": "de9b7584-91cc-43bf-8279-400c1bf3ccf1", 597 | "metadata": {}, 598 | "source": [ 599 | "## Challenge 3\n", 600 | "\n", 601 | "Try experimenting with different numbers of vector sizes, window sizes, and other parameters available in the `Word2Vec` module. Additionally, try training using skip-grams rather than CBOW." 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "id": "5d2bb2f9-1cd8-4e8c-b121-d087d7449bbe", 607 | "metadata": {}, 608 | "source": [ 609 | "# Classifying with Trained Embeddings" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "id": "44ce2d62-3d5b-4943-9266-e3f1241281cf", 615 | "metadata": {}, 616 | "source": [ 617 | "In the previous module, we used the airline tweets dataset to perform sentiment classification: we tried to classify the sentiment of a text given the bag-of-words representation. Can we do something similar with a word embedding representation?\n", 618 | "\n", 619 | "In the word embedding representation, we have an $N$-dimensional vector for each word in a tweet. How can we come up with a representation for the entire tweet?\n", 620 | "\n", 621 | "The simplest approach we could take is to simply average the vectors together to come up with a \"tweet representation\". 
Let's see how this works for predicting sentiment classification.\n", 622 | "\n", 623 | "First, we need to subset the dataset into the tweets which only have positive or negative sentiment:" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": null, 629 | "id": "5ae7a379-1f02-417e-9dbd-76af895e50af", 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "tweets_binary = tweets[tweets['airline_sentiment'] != 'neutral']\n", 634 | "y = tweets_binary['airline_sentiment']\n", 635 | "print(y.value_counts(normalize=True))" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "id": "35def02f-4232-4643-b427-1fedbf472905", 641 | "metadata": {}, 642 | "source": [ 643 | "Now, we need to compute the feature matrix. We will query the word vector in each tweet, and come up with an average for the sample:" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "id": "c0588d51-6d9e-4566-919b-2d75fd4e9e6d", 650 | "metadata": {}, 651 | "outputs": [], 652 | "source": [ 653 | "vector_size = 30\n", 654 | "X = np.zeros((len(y), vector_size))\n", 655 | "\n", 656 | "# Enumerate over tweets\n", 657 | "for idx, tweet in enumerate(tweets_binary['text_processed']):\n", 658 | " # Tokenize the current tweet\n", 659 | " tokens = word_tokenize(tweet)\n", 660 | " n_tokens = len(tokens)\n", 661 | " # Enumerate over tokens, obtaining word vectors\n", 662 | " for token in tokens:\n", 663 | " X[idx] += model.wv.get_vector(token)\n", 664 | " # Take the average\n", 665 | " X[idx] /= n_tokens" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "id": "78f75f96-ca46-421b-b5a6-7cd47af0c997", 671 | "metadata": {}, 672 | "source": [ 673 | "As before, we'll proceed with splitting the data into train/test examples. We'll bring back the logistic fitter function from before, with some small changes." 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "id": "654f81cb-a4ca-4c4d-a7b9-376b6b974988", 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": null, 689 | "id": "3d0f37a2-0712-4d46-93ab-6bdacba845b0", 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "def fit_logistic_regression(X, y):\n", 694 | " \"\"\"Fits a logistic regression model to provided data.\"\"\"\n", 695 | " model = LogisticRegressionCV(\n", 696 | " Cs=10,\n", 697 | " penalty='l2',\n", 698 | " max_iter=1000,\n", 699 | " cv=5,\n", 700 | " refit=True).fit(X, y)\n", 701 | " return model" 702 | ] 703 | }, 704 | { 705 | "cell_type": "markdown", 706 | "id": "b8126b42-f8ca-49af-8a8f-f243212478c9", 707 | "metadata": {}, 708 | "source": [ 709 | "We then run the fit, and evaluate it!" 
710 | ] 711 | }, 712 | { 713 | "cell_type": "code", 714 | "execution_count": null, 715 | "id": "f00d9f99-c44e-4b91-b905-32b70ce6674f", 716 | "metadata": {}, 717 | "outputs": [], 718 | "source": [ 719 | "# Fit the logistic regression model\n", 720 | "fitter = fit_logistic_regression(X_train, y_train)" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": null, 726 | "id": "c59c916c-ba50-4336-b345-6e9ef29b27fb", 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | "print(f\"Training accuracy: {fitter.score(X_train, y_train)}\")\n", 731 | "print(f\"Test accuracy: {fitter.score(X_test, y_test)}\")" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "id": "40e7141f-2df5-4f1a-a356-aac82c44f982", 737 | "metadata": {}, 738 | "source": [ 739 | "While this performance is pretty good, it's definitely not as good as the bag-of-words representation we used in the previous module. There are few reasons this might be the case:\n", 740 | "\n", 741 | "1. We used a word embedding on a relatively small corpus. A word embedding obtained from a very large corpus would perform better. The tricky part in doing this is that our smaller corpus may have some niche tokens that are not in the larger model, so we'd have to work around that.\n", 742 | "2. We simply averaged word embeddings across tokens. When doing this, we lose meaning in the ordering of words. Other methods, such as `doc2vec`, have been proposed to address these concerns.\n", 743 | "3. Word embeddings might be an overly complicated approach for the task at hand. In a tweet aimed at an airline, a person needs to convey their sentiment in only 140 characters. So they are more likely to use relatively simple words that easily convey sentiment, making a bag-of-words a natural approach.\n", 744 | "\n", 745 | "It's important to note that we also lose out on the interpretability of the logistic regression model, because the actual dimensions of each word vector do not themselves have any meaning. \n", 746 | "\n", 747 | "Moral of the story: word embeddings are great, but always start with the simpler model! This is a good way to baseline other approaches, and it might actually work pretty well!" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "id": "e443ff3f-bc6d-4190-b7aa-109d7909a242", 753 | "metadata": {}, 754 | "source": [ 755 | "## Challenge 4\n", 756 | "\n", 757 | "Write a function that performs the pipeline of building a `word2vec` model and constructing a design matrix. Use this function to try and see if you can change the performance of the model with other parameters (vector sizes, window sizes, etc.)." 
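,
"\n",
"\n",
"If you want to check your approach against one possibility, the sketch below simply re-uses the pieces defined above (`word_tokenize`, `Word2Vec`, `np`, and `tweets_binary`); treat it as one reasonable baseline rather than the answer."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "challenge-4-sketch",
"metadata": {},
"outputs": [],
"source": [
"# One possible sketch: train word2vec on the tokenized texts, then average vectors per document\n",
"def build_embedding_features(texts, vector_size=30, window=5, sg=0):\n",
"    # Tokenize each document and train a word2vec model on the result\n",
"    tokenized = [word_tokenize(text) for text in texts]\n",
"    w2v = Word2Vec(sentences=tokenized, vector_size=vector_size,\n",
"                   window=window, min_count=1, sg=sg)\n",
"    # Average the word vectors in each document to build the design matrix\n",
"    X = np.zeros((len(tokenized), vector_size))\n",
"    for idx, tokens in enumerate(tokenized):\n",
"        for token in tokens:\n",
"            X[idx] += w2v.wv.get_vector(token)\n",
"        X[idx] /= max(len(tokens), 1)\n",
"    return w2v, X\n",
"\n",
"# One possible experiment: larger vectors, a wider window, and skip-gram\n",
"w2v_sg, X_sg = build_embedding_features(\n",
"    tweets_binary['text_processed'], vector_size=100, window=8, sg=1)"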
758 | ] 759 | } 760 | ], 761 | "metadata": { 762 | "kernelspec": { 763 | "display_name": "Python 3 (ipykernel)", 764 | "language": "python", 765 | "name": "python3" 766 | }, 767 | "language_info": { 768 | "codemirror_mode": { 769 | "name": "ipython", 770 | "version": 3 771 | }, 772 | "file_extension": ".py", 773 | "mimetype": "text/x-python", 774 | "name": "python", 775 | "nbconvert_exporter": "python", 776 | "pygments_lexer": "ipython3", 777 | "version": "3.8.12" 778 | } 779 | }, 780 | "nbformat": 4, 781 | "nbformat_minor": 5 782 | } 783 | -------------------------------------------------------------------------------- /lessons/04_topic_modeling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a5aa27b3-30f9-4cd4-80e2-b9dd6460188a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Topic Modeling\n", 9 | "\n", 10 | "We've considered so far how to create numerical representations of words. We've even performed sentiment classification with those numerical representations. How else can we leverage these numerical representations to elucidate structure from natural language?\n", 11 | "\n", 12 | "In this session, we're going to discuss *topic modeling*. In topic modeling, we aim to discover how the documents in a corpus may be modeled as a function of specific topics. This is not the same thing as direct clustering, though, in which we might directly assign each document to a particular cluster.\n", 13 | "\n", 14 | "Consider genre classification. Some books may neatly fall into one genre, such as mystery, science fiction, etc. However, other books may be considered as incorporating multiple genres. You might have a fantasy novel which has mystery components to it, or a romance novel set in the future. In these cases, we don't want to cluster the fantasy novel into a \"fantasy\" bucket, and the romance novel in a \"romance\" bucket. We'd instead like to have some measure of assigning various topics, with different magnitudes to documents. This is the goal of topic modeling.\n", 15 | "\n", 16 | "We will use two approaches to perform topic modeling on the same corpus: non-negative matrix factorization, and latent dirichlet allocation." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "6fb1b9bf-6ed6-4abb-aab1-61d9fc209587", 22 | "metadata": {}, 23 | "source": [ 24 | "## Dataset: 20 Newsgroups\n", 25 | "\n", 26 | "We will be using a new dataset called the **20 Newsgroups** dataset. You can find the original page for this dataset [here](http://qwone.com/~jason/20Newsgroups/).\n", 27 | "\n", 28 | "This dataset is comprised of around 18000 newsgroups posts on 20 topics. The split between the train and test set is based upon a messages posted before and after a specific date. 
The news groups are as follows, with specific labels indicated:\n", 29 | "\n", 30 | "* *Computers*\n", 31 | " * comp.graphics\n", 32 | " * comp.os.ms-windows.misc\n", 33 | " * comp.sys.ibm.pc.hardware\n", 34 | " * comp.sys.mac.hardware\n", 35 | " * comp.windows.x\n", 36 | "* *Recreation*\n", 37 | " * rec.autos\n", 38 | " * rec.motorcycles\n", 39 | " * rec.sport.baseball\n", 40 | " * rec.sport.hockey\n", 41 | "* *Science*\n", 42 | " * sci.crypt\n", 43 | " * sci.electronics\n", 44 | " * sci.med\n", 45 | " * sci.space\n", 46 | "* *Miscellaneous*\n", 47 | " * misc.forsale\n", 48 | "* *Politics*\n", 49 | " * talk.politics.misc\n", 50 | " * talk.politics.guns\n", 51 | " * talk.politics.mideast\n", 52 | "* *Religion*\n", 53 | " * talk.religion.misc\n", 54 | " * alt.atheism\n", 55 | " * soc.religion.christian\n", 56 | " \n", 57 | "Let's begin by importing the dataset. We'll use `scikit-learn` to do so." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "c10c372a-1297-4846-b767-1c4ed2be41e1", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import matplotlib.pyplot as plt\n", 68 | "import numpy as np\n", 69 | "import pandas as pd\n", 70 | "\n", 71 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 72 | "\n", 73 | "%matplotlib inline" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "11e81a98-7080-4997-bc57-653f3c66a078", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "# Import fetcher function\n", 84 | "from sklearn.datasets import fetch_20newsgroups" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "id": "e7f8c845-a42d-42eb-84a6-a48614c91526", 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# Always check the documentation!\n", 95 | "full_data, labels = fetch_20newsgroups(\n", 96 | " subset='train',\n", 97 | " shuffle=True,\n", 98 | " random_state=1,\n", 99 | " remove=(\"headers\", \"footers\", \"quotes\"),\n", 100 | " return_X_y=True)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "e9da61bd-2425-4d9d-8812-79d2290644f7", 106 | "metadata": {}, 107 | "source": [ 108 | "Let's take a look at some of the data samples:" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "4f720479-b0b0-4823-b3db-350727159985", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "print(full_data[5])\n", 119 | "print('\\n\\n--------\\n\\n')\n", 120 | "print(full_data[50])\n", 121 | "print('\\n\\n--------\\n\\n')\n", 122 | "print(full_data[1000])" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "id": "f90f2cb3-fc83-4c5b-b90a-f100e17da6b3", 128 | "metadata": {}, 129 | "source": [ 130 | "If we take a look at the labels, we see that they're integers, each specifying one of the 20 possible classes:" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "8d35d8e9-8f3b-43de-84dc-3bcc2f0085e4", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "print(np.unique(labels))\n", 141 | "print(labels.shape)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "ca437a87-294f-4c3b-8cb1-bed40137973f", 147 | "metadata": {}, 148 | "source": [ 149 | "We can access the corresponding names of these labels by using a different keyword argument in the original `fetch_20newsgroups` call:" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "id": 
"5df7fdc8-5e1b-461d-8bde-50997763c986", 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "newsgroups = fetch_20newsgroups(\n", 160 | " subset='train',\n", 161 | " shuffle=True,\n", 162 | " random_state=1,\n", 163 | " remove=(\"headers\", \"footers\", \"quotes\"))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "77f0e555-f7d7-42e4-b8c6-56219306a383", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "list(newsgroups)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "592d2040-3685-4a17-bb67-51d193d40e66", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "newsgroups.target_names" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "67b7b0db-88c6-4be3-9b1e-16a1db8d0ad2", 189 | "metadata": {}, 190 | "source": [ 191 | "To simplify our analysis a little bit, we'll only use a portion of this dataset, which will be set by the `n_subsamples` variable. However, you can feel free to adjust this if you want to use a bigger portion of the dataset." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "id": "c7bcdac2-6226-4659-a5a2-1b76a6dc130d", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "n_subsamples = 2000\n", 202 | "data = full_data[:n_subsamples]" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "id": "2c901796-78b1-41a0-bda4-83b2f76aa7f8", 208 | "metadata": { 209 | "jp-MarkdownHeadingCollapsed": true, 210 | "tags": [] 211 | }, 212 | "source": [ 213 | "## Non-negative Matrix Factorization\n", 214 | "\n", 215 | "Non-negative matrix factorization (NMF) is a dimensionality reduction technique that can be used to perform topic modeling. It was first [introduced](https://www.nature.com/articles/44565) by Lee and Seung in 1999.\n", 216 | "\n", 217 | "What does *non-negative matrix factorization* mean? Non-negative implies we're not working with any negative numbers, and matrix factorization implies we're taking a matrix, and breaking it down into \"factors\".\n", 218 | "\n", 219 | "The matrix under consideration is going to be some numerical representation of the text. We've already considered one - the document term matrix! Specifically, we're going to build a TF-IDF matrix from the data - let's call this matrix $V$. Let's remind ourselves what $V$ is: it's a $D\\times T$ matrix, where $D$ is the number of documents, and $T$ is the number or terms, or tokens. \n", 220 | "\n", 221 | "![NMF](../images/nmf.png)\n", 222 | "\n", 223 | "The goal in NMF is to write this matrix as a product of two matrices,\n", 224 | "\n", 225 | "$$\n", 226 | "V \\approx WH\n", 227 | "$$\n", 228 | "\n", 229 | "where $W$ is a $D\\times K$ matrix and $H$ is a $K\\times T$ matrix. So, in matrix $W$, we can still consider rows as corresponding to documents, and in $H$, we can think of columns as corresponding to terms. But what about $K$, the inner dimension?\n", 230 | "\n", 231 | "We can think of $K$ as enumerating *topics*. If $K$ corresponds to topics, then each row of $H$ corresponds to a different topic. We can interpret $H$ as enumerating what contribution each *term* makes to each *topic*. For example, if the first row of the NMF only has non-zero entries for terms `soccer`, `basketball`, and `baseball`, we might reasonably conclude that the topic corresponds to \"sports\". 
The numbers for each entry indicate the contribution of that term to the topic - so if the topic is mainly about baseball, the entry for `baseball` might have a higher value.\n", 232 | "\n", 233 | "What does this mean for $W$? It details how each *document* (rows) breaks down into *topics* (columns). So, we can think of NMF as estimating the contribution of different topics to a specific document. In the newsgroups data, we might expect a large contribution from that \"sports\" topic above to the samples labeled \"rec.sport.baseball\".\n", 234 | "\n", 235 | "The breakdown of the original matrix into $W$ and $H$ can only be interpreted if there are no negative entries in any matrix. We already know a TF-IDF DTM is going to be non-negative. If we guarantee that $W$ and $H$ are as well, we can quite literally think of this factorization as creating building blocks for each document!\n", 236 | "\n", 237 | "There's a small issue here: we can get the breakdown of documents into topics, but it's up to us to \"interpret\" what the topics might be. This can be a tricky business, as we'll see. Furthermore, we don't even know how many topics we should pick! There are procedures to identify a good number of topics, but at some level it is subjective.\n", 238 | "\n", 239 | "Let's try fitting an NMF to the newsgroups data. First, we need to use `TfidfVectorizer` to transform the data into a document term matrix (remember how to do this?)." 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "d4325121-5c93-44f5-90c3-fd87df7ce095", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Is this a good scenario to be removing stop words?\n", 250 | "n_tokens = 1000\n", 251 | "\n", 252 | "vectorizer = TfidfVectorizer(\n", 253 | " max_df=0.95,\n", 254 | " min_df=2,\n", 255 | " max_features=n_tokens,\n", 256 | " stop_words=\"english\")" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "334a2faf-ea15-48e2-bbc9-8a80675c83c6", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "# Perform vectorizing\n", 267 | "tfidf = vectorizer.fit_transform(data)\n", 268 | "tokens = vectorizer.get_feature_names_out()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "id": "7d83112a-4563-4270-827c-610a388ffba7", 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "# How many samples, and how many tokens?\n", 279 | "tfidf.todense().shape" 280 | ] 281 | }, 282 | { 283 | "cell_type": "markdown", 284 | "id": "0cfd70fd-3b2e-4f8b-854c-34c59345193f", 285 | "metadata": {}, 286 | "source": [ 287 | "Let's look at the tokens with the highest TF-IDF scores:" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "id": "09526020-84da-438e-a46b-afc765ace3ed", 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [ 297 | "tfidf_df = pd.DataFrame(\n", 298 | " tfidf.todense(),\n", 299 | " columns=tokens)\n", 300 | "tfidf_df.sum(axis=0).sort_values(ascending=False).head(30)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "id": "fadbd0bb-c2bd-4d58-9291-c17ad769c1ce", 306 | "metadata": {}, 307 | "source": [ 308 | "We can perform NMF using the `NMF` class from `scikit-learn`:" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "id": "b33adb99-30bb-4522-a8be-ae9f58fd1ea8", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "from sklearn.decomposition import NMF" 319 | ] 320 | },
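  {
   "cell_type": "markdown",
   "id": "nmf-toy-example",
   "metadata": {},
   "source": [
    "Before fitting NMF on the real newsgroups data, here is a small, hedged sketch of what the factorization does on a made-up matrix. The numbers below are invented purely for illustration: a tiny \"DTM\" $V$ with 4 documents and 6 terms is factored into $W$ (4 documents by 2 topics) and $H$ (2 topics by 6 terms), and the product $WH$ approximately reconstructs $V$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "nmf-toy-code",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Toy illustration of V ~= W @ H (not part of the newsgroups analysis)\n",
    "V_toy = np.array([\n",
    "    [1.0, 0.9, 0.8, 0.0, 0.0, 0.1],  # mostly 'topic A' terms\n",
    "    [0.9, 1.0, 0.7, 0.0, 0.1, 0.0],\n",
    "    [0.0, 0.0, 0.1, 1.0, 0.8, 0.9],  # mostly 'topic B' terms\n",
    "    [0.1, 0.0, 0.0, 0.9, 1.0, 0.8]])\n",
    "\n",
    "nmf_toy = NMF(n_components=2, init='nndsvda', max_iter=500, random_state=0)\n",
    "W_toy = nmf_toy.fit_transform(V_toy)  # documents x topics\n",
    "H_toy = nmf_toy.components_           # topics x terms\n",
    "\n",
    "print(W_toy.shape, H_toy.shape)\n",
    "# The reconstruction W @ H should be close to the original matrix\n",
    "print(np.round(W_toy @ H_toy, 2))"
   ]
  },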
321 | { 322 | "cell_type": "code", 323 | "execution_count": null, 324 | "id": "765e4544-b709-4cba-ad1e-86625839d5c9", 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "n_components = 10\n", 329 | "random_state = 1\n", 330 | "\n", 331 | "nmf = NMF(\n", 332 | " n_components=n_components,\n", 333 | " random_state=random_state,\n", 334 | " alpha=0.1,\n", 335 | " l1_ratio=0.5,\n", 336 | " init='nndsvda',\n", 337 | " max_iter=500).fit(tfidf)" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "id": "68485e19-089a-463c-af90-00b2ebefb70d", 343 | "metadata": {}, 344 | "source": [ 345 | "We can take a look at the $H$ matrix, or the topics, by examining the `components_` member variable:" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "id": "5c698fe2-8671-426d-9d65-d7962aff07f4", 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "print(nmf.components_.shape)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "id": "a8005931-59a7-45a1-8904-ed7784885e97", 361 | "metadata": {}, 362 | "source": [ 363 | "Let's take a peek at the distribution of values in the first topic:" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "id": "01c1b83a-0be1-412d-8b16-9178cece4417", 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [ 373 | "fig, ax = plt.subplots(1, 1)\n", 374 | "\n", 375 | "ax.bar(x=np.arange(n_tokens), height=nmf.components_[0])" 376 | ] 377 | }, 378 | { 379 | "cell_type": "markdown", 380 | "id": "ec26cb5e-9ff0-41bc-a827-5a1fbe93ca93", 381 | "metadata": {}, 382 | "source": [ 383 | "This is nice, but it'd be nice to look at the top tokens - the large spikes we see there - and ignore all the smaller contributions to a topic. We'll use a plotter function which will nicely show the largest contributions to each topic, as well as what the corresponding tokens are:" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": null, 389 | "id": "003e9781-9a3b-4dbf-a4c9-55f60839ece0", 390 | "metadata": {}, 391 | "outputs": [], 392 | "source": [ 393 | "def plot_top_words(model, feature_names, n_top_words=10, n_row=2, n_col=5, normalize=False):\n", 394 | " \"\"\"Plots the top words from a topic model.\n", 395 | " \n", 396 | " Parameters\n", 397 | " ----------\n", 398 | " model : topic model object (e.g., LatentDirichletAllocation, NMF)\n", 399 | " The trained topic model. 
It should have a components_ attribute.\n", 400 | " feature_names : array-like of strings\n", 401 | " The names of each token, as a list or array.\n", 402 | " n_top_words : int\n", 403 | " The number of words to plot for each topic.\n", 404 | " n_row : int\n", 405 | " The number of rows in the plot.\n", 406 | " n_col : int\n", 407 | " The number of columns in the plot.\n", 408 | " normalize : boolean\n", 409 | " If True, normalizes the components so that they sum to 1 along samples.\n", 410 | " \"\"\"\n", 411 | " # Create figure\n", 412 | " fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)\n", 413 | " axes = axes.flatten()\n", 414 | " components = model.components_\n", 415 | " # Normalize components, if necessary\n", 416 | " if normalize:\n", 417 | " components = components / components.sum(axis=1)[:, np.newaxis]\n", 418 | " # Iterate over each topic\n", 419 | " for topic_idx, topic in enumerate(components):\n", 420 | " # Obtain the top words for each topic\n", 421 | " top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]\n", 422 | " # Get the token names\n", 423 | " top_features = [feature_names[i] for i in top_features_ind]\n", 424 | " # Get their values\n", 425 | " weights = topic[top_features_ind]\n", 426 | "\n", 427 | " # Plot the token weights as a bar plot\n", 428 | " ax = axes[topic_idx]\n", 429 | " ax.barh(top_features, weights, height=0.7)\n", 430 | " ax.set_title(f\"Topic {topic_idx +1}\", fontdict={\"fontsize\": 20})\n", 431 | " ax.invert_yaxis()\n", 432 | " ax.tick_params(axis=\"both\", which=\"major\", labelsize=20)\n", 433 | " \n", 434 | " # Customize plot\n", 435 | " for i in \"top right left\".split():\n", 436 | " ax.spines[i].set_visible(False)\n", 437 | "\n", 438 | " plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)\n", 439 | "\n", 440 | " return fig, axes" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "de835650-7d53-4570-8281-3b4c4b3e482e", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "fig, axes = plot_top_words(nmf, tokens)\n", 451 | "plt.show()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "id": "cf55084e-a174-4356-a6dc-bac12b81f45c", 457 | "metadata": {}, 458 | "source": [ 459 | "What can we tell from this? For one, Topic 2 looks like it might be related to computers, Topic 3 looks very much like a topic related to religion, and Topic 8 looks like it's about sports. Note that, if you used a different random seed, these topics might look different for you.\n", 460 | "\n", 461 | "Some of the topics look a little bit harder to interpret, and that comes with the territory in topic modeling.\n", 462 | "\n", 463 | "We were able to get the $H$ matrix, but what about the $W$ matrix? To get this, we need to *transform* the DTM into the *basis* created by the NMF factorization. 
We can do this with the `transform` function:" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "id": "4c7aa63b-d2a5-444d-a8f8-7455887fe882", 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "W = nmf.transform(tfidf)\n", 474 | "print(W.shape)" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "id": "281d03cb-b5d2-454c-88e9-8f333815ef5c", 480 | "metadata": {}, 481 | "source": [ 482 | "Let's take a look at a random entry:" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "id": "22d67fec-5c41-4fa8-806d-37bbdefe9bb9", 489 | "metadata": {}, 490 | "outputs": [], 491 | "source": [ 492 | "entry = 999\n", 493 | "print(data[999])" 494 | ] 495 | }, 496 | { 497 | "cell_type": "markdown", 498 | "id": "26e39129-137d-443d-93e4-f2ec4f6a800e", 499 | "metadata": {}, 500 | "source": [ 501 | "The accompanying label for this entry can be obtained from the `newsgroups` variable we created earlier:" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "id": "77bb79d4-8eb5-4e39-bde3-957f778db917", 508 | "metadata": {}, 509 | "outputs": [], 510 | "source": [ 511 | "newsgroups.target_names[labels[999]]" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "id": "24da7ff8-501b-4f81-9f1b-116f4426de92", 517 | "metadata": {}, 518 | "source": [ 519 | "Let's examine the breakdown of this entry by topic:" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "id": "881c7cce-984c-4b1a-9221-05b5ab750300", 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "fig, ax = plt.subplots(1, 1)\n", 530 | "ax.bar(x=np.arange(n_components), height=W[999])\n", 531 | "ax.set_xticks(np.arange(n_components))\n", 532 | "ax.set_xticklabels(np.arange(n_components) + 1)\n", 533 | "ax.set_xlabel('Topic', fontsize=15)" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "id": "83e02528-7d53-4365-9589-1368ce0e5d50", 539 | "metadata": {}, 540 | "source": [ 541 | "What does this tell you?\n", 542 | "\n", 543 | "Notice that many of the topics have been zeroed out entirely - this is a product of enforcing *sparsity* in the model, and it stems from the `alpha` parameter in the NMF." 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "id": "aafdde48-56c6-4550-ad7a-f6d100e95694", 549 | "metadata": {}, 550 | "source": [ 551 | "---\n", 552 | "\n", 553 | "### Challenge 1: Exploring Hyperparameters in NMF\n", 554 | "\n", 555 | "The choice of 10 components was somewhat arbitrary. It was something we had to do before we could proceed with fitting the model to the data. This is what's known as a *hyperparameter*. There are other hyperparameters in the `NMF`. For example, the `alpha` values specifies to what degree we should force values to be set equal to zero.\n", 556 | "\n", 557 | "Try fitting the NMF with other variations of hyperparameters, and plot the resulting topics using the `plot_top_words` function. 
What do you notice?\n", 558 | "\n", 559 | "---" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "id": "98fb9ea7-073d-4520-aafc-48044cba841d", 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "# YOUR CODE HERE" 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "id": "5b4b4900-e5a5-4774-adff-7fd3f3146b9f", 575 | "metadata": {}, 576 | "source": [ 577 | "## Latent Dirichlet Allocation\n", 578 | "\n", 579 | "Latent Dirichlet Allocation (LDA) is a Bayesian model that captures how specific topics can generate documents. It was [introduced](https://jmlr.csail.mit.edu/papers/v3/blei03a.html) in machine learning by Blei et al. It is one of the earliest models applied to topic modeling.\n", 580 | "\n", 581 | "One significant difference between LDA and NMF is that LDA is a *generative* model. This means that it can be used to *generate* new documents, by sampling from it. Assume we have a number of topics $T$. Then, we generate a new document as follows:\n", 582 | "\n", 583 | "1. Choose a number of words $N$ according to a Poisson distribution. If you're not familiar with a Poisson distribution, don't worry - the only thing you need to know is that the outputs from a Poisson distribution can only be nonnegative integers (e.g., 0, 1, 2, 3 ...).\n", 584 | "2. Choose a vector of values $\\boldsymbol{\\theta}=(\\theta_1, \\theta_2, \\ldots, \\theta_T)$ from a Dirichlet distribution. The details of a Dirichlet distribution aren't too important other than that it guarantees all of the $\\theta_i$ add up to 1, and are positive. So, we can think of the $\\theta_i$ as proportions, or probabilities.\n", 585 | "3. For each of the $N$ words $w_n$:\n", 586 | "- Choose a topic $t_n$ according to a Multinomial distribution following $\\boldsymbol{\\theta}$. In other words, choose a topic according to the probabilities set by $\\boldsymbol{\\theta}$ (remember, we're thinking of these values as proportions, or probabilities).\n", 587 | "- Choose a word $w_n$ from a probability distribution $p(w_n|t_n)$ conditioned on $t_n$. This probability distribution is another Multinomial distribution.\n", 588 | "\n", 589 | "LDA does not model the order of the words, so in the end, it produces a collection of words - just like the bag-of-words representation.\n", 590 | "\n", 591 | "![lda](../images/lda.png)\n", 592 | "\n", 593 | "There are a lot of variables there, so let's consider a concrete example. Let's suppose we have two topics: soccer and basketball. These are $t_1$ and $t_2$. \n", 594 | "\n", 595 | "Some topics are more likely to contain certain words than others. For example, soccer is more likely to contain `liverpool` and `freekick`, but probably not `nba`. Basketball, meanwhile, will very likely contain `rebound` and `nba`. Furthermore, even though it's unlikely, a soccer topic might still refer to the `nba`. This unlikeliness is captured by the probabilities assigned in the distribution $p(w_n|t_n)$.\n", 596 | "\n", 597 | "Next, each document might consist of multiple \"proportions\" of topics. We've already seen this in NMF, only this time, LDA captures this via a probability distribution rather than a matrix operation. So, Document 1 might mainly be about soccer, and not really reference basketball - this would be reflected in the probabilities $\\boldsymbol{\\theta}=(0.9, 0.1)$. 
Meanwhile, another document might equally reference soccer and basketball, so we'd need a different set of parameters $\\boldsymbol{\\theta}=(0.5, 0.5)$.\n", 598 | "\n", 599 | "Once again, we're going to use `scikit-learn` to perform LDA. This time, however, we'll use a `CountVectorizer`, since LDA explicitly models *counts*." 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "id": "e5840f2e-3d16-40b2-a005-ce7b38c7fa6f", 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "# Use a CountVectorizer\n", 610 | "n_tokens = 1000\n", 611 | "count_vectorizer = CountVectorizer(\n", 612 | " max_df=0.95,\n", 613 | " min_df=2,\n", 614 | " max_features=n_tokens,\n", 615 | " stop_words=\"english\")" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "id": "95c02cc0-e3c9-4d0f-bd11-48a698df8a99", 622 | "metadata": {}, 623 | "outputs": [], 624 | "source": [ 625 | "# Fit and transform CountVectorizer\n", 626 | "counts = count_vectorizer.fit_transform(data)\n", 627 | "print(counts.shape)" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "332778ad-4b3a-4e29-9dde-d1fea1f69769", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "tokens = count_vectorizer.get_feature_names_out()" 638 | ] 639 | }, 640 | { 641 | "cell_type": "markdown", 642 | "id": "68a4fdb8-1c6f-486b-bece-42208b918970", 643 | "metadata": {}, 644 | "source": [ 645 | "This DTM is going to look very similar to the previous DTM, so let's proceed to the LDA fitting!" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "id": "2b53abe0-e419-45e7-9b51-14b276a25d64", 652 | "metadata": {}, 653 | "outputs": [], 654 | "source": [ 655 | "from sklearn.decomposition import LatentDirichletAllocation" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "id": "04e65d6b-3d87-463b-a56c-e5d0ccf48d9b", 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "n_components = 10\n", 666 | "random_state = 0\n", 667 | "\n", 668 | "lda = LatentDirichletAllocation(\n", 669 | " n_components=n_components,\n", 670 | " max_iter=5,\n", 671 | " learning_method=\"online\", # Use when dataset is large\n", 672 | " learning_offset=50.0, \n", 673 | " random_state=random_state)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "id": "ea61cb9e-7c02-4766-b958-263e4a36b739", 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "# Fit the LDA model!\n", 684 | "lda.fit(counts)" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "id": "7362b9d7-d58d-423f-8940-6dc5cd726b02", 690 | "metadata": {}, 691 | "source": [ 692 | "How can we analyze the trained model? The `lda` object also comes with a `components_` variable, which corresponds to the topic word distribution. 
Let's plot these values using the function we created above:" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "id": "c5ebb041-5b6d-4f99-8518-50cd8ee43540", 699 | "metadata": {}, 700 | "outputs": [], 701 | "source": [ 702 | "# This time, we're normalizing - what does this do?\n", 703 | "fig, axes = plot_top_words(lda, tokens, normalize=True)\n", 704 | "plt.show()" 705 | ] 706 | }, 707 | { 708 | "cell_type": "markdown", 709 | "id": "d404af01-7ee8-4125-a86e-3e9244f84046", 710 | "metadata": {}, 711 | "source": [ 712 | "---\n", 713 | "\n", 714 | "### Challenge 2: Exploring Hyperparameters in LDA\n", 715 | "\n", 716 | "As in the case of NMF, try performing LDA with other variations of hyperparameters, and plot the resulting topics using the `plot_top_words` function. Use the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html) as a guide to choose different hyperparameters.\n", 717 | "\n", 718 | "---" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": null, 724 | "id": "8b3ce152-b9f3-4d4d-8d02-c3bbf6e6c6cc", 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "# YOUR CODE HERE" 729 | ] 730 | }, 731 | { 732 | "cell_type": "markdown", 733 | "id": "24e8eb97-027f-47db-a04a-4dea8d1b6ecc", 734 | "metadata": {}, 735 | "source": [ 736 | "## Topic Modeling as Dimensionality Reduction\n", 737 | "\n", 738 | "In both NMF and LDA, we broke down the documents into topics. This was, in effect, a *change in representation*. We went from a DTM representation, to a representation of *topics*. \n", 739 | "\n", 740 | "Because there are fewer topics than there are tokens, we can think of this as a *dimensionality reduction*. This is desirable for several reasons, the main one being that it's easier to interpret, say, 10 dimensions than it is to interpret 1000.\n", 741 | "\n", 742 | "This is computationally true, as well: once we get to higher dimensions, it's harder to compare different vectors with each other, because they generally end up all close to orthogonal. This is known as the *curse of dimensionality*.\n", 743 | "\n", 744 | "Let's first transform the counts into the topic representation:" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "id": "e28b5b78-2d74-4de2-b14c-8f434530f55e", 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "topic_representation = lda.transform(counts)\n", 755 | "topic_representation.shape" 756 | ] 757 | }, 758 | { 759 | "cell_type": "markdown", 760 | "id": "641fe529-7bdf-49d2-9de7-1a43d89dd197", 761 | "metadata": {}, 762 | "source": [ 763 | "We're going to use the cosine similarity to calculate the similarity between pairs of documents (remember this from Word Embeddings?). 
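As a quick refresher, the next cell is a minimal hand-rolled sketch of that computation: the cosine similarity of two vectors is their dot product divided by the product of their norms. The helper name `cosine_sim` is just for illustration."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cosine-refresher",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand-rolled cosine similarity, for intuition only\n",
    "def cosine_sim(u, v):\n",
    "    \"\"\"Computes the cosine similarity between two 1-D numpy arrays.\"\"\"\n",
    "    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))\n",
    "\n",
    "# Compare the first two documents in the topic representation\n",
    "print(cosine_sim(topic_representation[0], topic_representation[1]))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cosine-sklearn-intro",
   "metadata": {},
   "source": [
    "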
`scikit-learn` has a `cosine_similarity` function we can use:" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": null, 769 | "id": "ce068c6a-aafb-4fdf-b1fe-5b742e472a97", 770 | "metadata": {}, 771 | "outputs": [], 772 | "source": [ 773 | "from sklearn.metrics.pairwise import cosine_similarity" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "id": "d62d3dde-1f72-45b9-a403-a45d25c5cf14", 779 | "metadata": {}, 780 | "source": [ 781 | "Let's first calculate the similarity of the first few documents in the term representation:" 782 | ] 783 | }, 784 | { 785 | "cell_type": "code", 786 | "execution_count": null, 787 | "id": "30c68401-c26d-404e-b39c-03d97c5f21f4", 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [ 791 | "cosine_similarity(counts[:4])" 792 | ] 793 | }, 794 | { 795 | "cell_type": "markdown", 796 | "id": "f59c32e9-7813-4839-8b2d-91f81d4ec504", 797 | "metadata": {}, 798 | "source": [ 799 | "Not similar at all! Let's try in the topic representation:" 800 | ] 801 | }, 802 | { 803 | "cell_type": "code", 804 | "execution_count": null, 805 | "id": "a5e27063-66bb-409a-83be-f34b685aff2b", 806 | "metadata": {}, 807 | "outputs": [], 808 | "source": [ 809 | "cosine_similarity(topic_representation[:4])" 810 | ] 811 | }, 812 | { 813 | "cell_type": "markdown", 814 | "id": "320a8bb4-4075-4a9b-ac9b-ff0acbb0a0e3", 815 | "metadata": {}, 816 | "source": [ 817 | "There's a much bigger spread in the similarities, now. What about the rest of the documents?" 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "id": "abb994b3-d853-4660-b3bf-84af4f82e00f", 823 | "metadata": {}, 824 | "source": [ 825 | "---\n", 826 | "\n", 827 | "### Challenge 3: Finding Similar Documents\n", 828 | "\n", 829 | "Calculate the cosine similarity between all pairs of documents, and find the two documents whose cosine similarity is the highest. What are these documents? 
Do they seem similar?\n", 830 | "\n", 831 | "---" 832 | ] 833 | }, 834 | { 835 | "cell_type": "code", 836 | "execution_count": null, 837 | "id": "8c05d254-1120-4ae8-b29a-77f6d4e81517", 838 | "metadata": {}, 839 | "outputs": [], 840 | "source": [ 841 | "# YOUR CODE HERE" 842 | ] 843 | } 844 | ], 845 | "metadata": { 846 | "kernelspec": { 847 | "display_name": "Python 3 (ipykernel)", 848 | "language": "python", 849 | "name": "python3" 850 | }, 851 | "language_info": { 852 | "codemirror_mode": { 853 | "name": "ipython", 854 | "version": 3 855 | }, 856 | "file_extension": ".py", 857 | "mimetype": "text/x-python", 858 | "name": "python", 859 | "nbconvert_exporter": "python", 860 | "pygments_lexer": "ipython3", 861 | "version": "3.8.12" 862 | } 863 | }, 864 | "nbformat": 4, 865 | "nbformat_minor": 5 866 | } 867 | -------------------------------------------------------------------------------- /solutions/01_preprocessing_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "794bac7d-1acb-43a7-8e17-1c9fa6e007e1", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Fundamentals, Part 1 Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "270c7730-8f60-4449-9920-6030b6dcd1b9", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import nltk\n", 19 | "import os\n", 20 | "import pandas as pd\n", 21 | "import re\n", 22 | "import spacy\n", 23 | "\n", 24 | "from nltk.corpus import stopwords\n", 25 | "from nltk.stem import WordNetLemmatizer\n", 26 | "from nltk.tokenize import word_tokenize\n", 27 | "from string import punctuation" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "d90a7e63-04b7-4edb-bf3b-27a381d3d4ae", 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "# Use here to create the filepath\n", 38 | "text_path = '../data/sowing_and_reaping.txt'\n", 39 | "\n", 40 | "# Open and read the text\n", 41 | "with open(text_path, 'r') as file:\n", 42 | " raw_text = file.read()\n", 43 | " \n", 44 | "# Remove the front and end matter\n", 45 | "sowing_and_reaping = raw_text[1114:684814]" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "id": "d2f446a0-61d7-4055-9acb-34b010f8572e", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# Import pandas\n", 56 | "import pandas as pd\n", 57 | "# Use pandas to import Tweets\n", 58 | "csv_path = '../data/airline_tweets.csv'\n", 59 | "tweets = pd.read_csv(csv_path, sep=',')" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "id": "fe1682c8-491d-411d-858b-03bb1221481b", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "example2_path = '../data/example2.txt'\n", 70 | "\n", 71 | "with open(example2_path) as file:\n", 72 | " example2 = file.read()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "cd8599eb-1e4d-43db-b62c-dce158a18ce0", 78 | "metadata": {}, 79 | "source": [ 80 | "---\n", 81 | "\n", 82 | "### Challenge 1: Working with Strings\n", 83 | "\n", 84 | "* What type of object is `sowing_and_reaping`?\n", 85 | "* How many characters are in `sowing_and_reaping`?\n", 86 | "* How can we get the first 1000 characters of `sowing_and_reaping`?\n", 87 | "\n", 88 | "---" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "1f8773e7-208c-4272-97e2-061c97860438", 95 | "metadata": {}, 96 | "outputs": [], 
97 | "source": [ 98 | "type(sowing_and_reaping)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "b8d3572a-9cad-4bbc-bc2a-cec92dcd044a", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "len(sowing_and_reaping)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "c63cdfdf-e665-4c37-a755-8e110dbbdad4", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "sowing_and_reaping[:1000]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "7f57a143-8876-4305-b3f1-c77001296919", 124 | "metadata": {}, 125 | "source": [ 126 | "---\n", 127 | "\n", 128 | "### Challenge 2: Reading in Many Files\n", 129 | "\n", 130 | "The `data` folder contains another folder called `amazon`, which contains many `csv` files of Amazon reviews. Use a `for` loop to read in each dataframe. Do the following:\n", 131 | "\n", 132 | "* We've provided a path to the `amazon` folder, and a list of all the file names within the folder using the `os.listdir()` function.\n", 133 | "* Iterate over all these files, and import them using `pd.read_csv()`. You will need to use `os.path.join()` to create the correct path. Additionally, you need to provide `pandas` with the column names since they are not included in the reviews. We have created the `column_names` variable for you.\n", 134 | "* Extract the text column from each dataframe, and add them to the `reviews` list. \n", 135 | "* How many total reviews do you obtain?\n", 136 | "\n", 137 | "---" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "6f743402-ace4-4b3b-8175-ba8b7ae87197", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# The os package has useful tools for file manipulation\n", 148 | "import os\n", 149 | "# Amazon review folder\n", 150 | "amazon_path = '../data/amazon'\n", 151 | "# List all the files in the amazon folder\n", 152 | "files = os.listdir(amazon_path)\n", 153 | "# Column names for each file\n", 154 | "column_names = ['id',\n", 155 | " 'product_id',\n", 156 | " 'user_id',\n", 157 | " 'profile_name',\n", 158 | " 'helpfulness_num',\n", 159 | " 'helpfulness_denom',\n", 160 | " 'score',\n", 161 | " 'time',\n", 162 | " 'summary',\n", 163 | " 'text']\n", 164 | "# Add each review text to this list\n", 165 | "reviews = []" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "id": "3c052ec5-2f03-4c45-9f3e-ac0bbf29da52", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "for file in files:\n", 176 | " # Check that the file is actually a CSV file\n", 177 | " if os.path.splitext(file)[1] == '.csv':\n", 178 | " full_path = os.path.join(amazon_path, file)\n", 179 | " reviews_df = pd.read_csv(full_path, sep=',', names=column_names) \n", 180 | " text = list(reviews_df['text'])\n", 181 | " reviews.extend(text)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "id": "6be9a6e3-4658-4556-9bc6-40e58b2045cd", 187 | "metadata": {}, 188 | "source": [ 189 | "---\n", 190 | "\n", 191 | "### Challenge 3: Text Cleaning with Multiple Steps\n", 192 | "\n", 193 | "In Challenge 2, we imported many Amazon reviews, and stored them in a variable called `reviews`. Each element of the list is a string, representing the text of a single review. 
For each review:\n", 194 | "\n", 195 | "* Replace any URLs and digits.\n", 196 | "* Make all characters lower case.\n", 197 | "* Strip all blankspaces.\n", 198 | "\n", 199 | "---" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "id": "f90b0867-06bb-4641-95b7-0eb584192b7c", 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "def preprocess(text):\n", 210 | " \"\"\"Preprocesses a string.\"\"\"\n", 211 | " # Lowercase\n", 212 | " text = text.lower()\n", 213 | " # Replace URLs\n", 214 | " url_pattern = r'https?:\\/\\/.*[\\r\\n]*'\n", 215 | " url_repl = ' URL '\n", 216 | " text = re.sub(url_pattern, url_repl, text)\n", 217 | " # Replace digits\n", 218 | " digit_pattern = '\\d+'\n", 219 | " digit_repl = ' DIGIT '\n", 220 | " text = re.sub(digit_pattern, digit_repl, text)\n", 221 | " # Remove blank spaces\n", 222 | " blankspace_pattern = r'\\s+'\n", 223 | " blankspace_repl = ' '\n", 224 | " text = re.sub(blankspace_pattern, blankspace_repl, text)\n", 225 | " # Last step: strip\n", 226 | " return text.strip()" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "id": "4231fc31-0683-41f3-859c-6b6a3618c03a", 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "processed_reviews = [preprocess(review) for review in reviews]" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "00853c91-5118-4f2f-a575-c368c3534167", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "print(processed_reviews[0])" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "id": "0fcfa89a-b347-4c5d-b72c-238b901730f0", 252 | "metadata": {}, 253 | "source": [ 254 | "---\n", 255 | "\n", 256 | "### Challenge 4: Tokenizing a Large Text\n", 257 | "\n", 258 | "Tokenize \"Sowing and Reaping\", which we imported at the beginning of this workshop. Use a method of your choice.\n", 259 | "\n", 260 | "Once you've tokenized, find all the unique words types (you might want the `set` function). 
Then, sort the resulting `set` object to create a vocabulary (you might want to use the `sorted` function).\n", 261 | "\n", 262 | "---" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "15e86f01-23b4-4eac-bd03-845b62e655b4", 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [ 272 | "# nltk\n", 273 | "tokens = word_tokenize(sowing_and_reaping)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "id": "39d0a737-053c-4817-89bc-c06775ef22b0", 280 | "metadata": {}, 281 | "outputs": [], 282 | "source": [ 283 | "# spacy\n", 284 | "nlp = spacy.load(\"en_core_web_sm\")\n", 285 | "doc = nlp(sowing_and_reaping)\n", 286 | "tokens = [token.text for token in doc]" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "id": "c5697385-17b8-4498-b849-0c9abbe7b5d5", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "unique_tokens = set(tokens)\n", 297 | "sorted_tokens = sorted(unique_tokens)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "cb8372c1-909d-4076-8f2b-2dc37b66bc6d", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "print(sorted_tokens[:100])" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "id": "1d08eb47-9f5b-4615-92c1-578379ec9835", 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [ 317 | "print(sorted_tokens[-100:])" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "2a5ef86b-3326-4342-919d-f7e7251d12c1", 323 | "metadata": {}, 324 | "source": [ 325 | "---\n", 326 | "\n", 327 | "### Challenge 5: Apply a Lemmatizer to Text\n", 328 | "\n", 329 | "Lemmatize the tokenized `example2` text using the `nltk`'s `WordNetLemmatizer`.\n", 330 | "\n", 331 | "---" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "id": "99ee90b1-619f-435e-8548-5deff33bf882", 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "lemmatizer = WordNetLemmatizer()" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": null, 347 | "id": "b0df0d22-48f0-45a3-997f-90b047e9473b", 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "tokens = word_tokenize(example2)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "id": "d37407ca-a875-4107-9db3-213d21cc35ac", 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "lemmatized = [lemmatizer.lemmatize(token) for token in tokens]" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "id": "97eac648-f06e-4d5e-8097-4278fc4c3552", 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "print(lemmatized)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "id": "7ec9b420-9050-40e7-9cbb-0961d99e3fc7", 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [ 381 | "print(example2)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "id": "390b9de5-7f14-48fa-b818-4e8cfe502d77", 387 | "metadata": {}, 388 | "source": [ 389 | "---\n", 390 | "\n", 391 | "### Challenge 6: Putting it All Together\n", 392 | "\n", 393 | "Write a function called `preprocess()` that accepts a string and performs the following preprocessing steps:\n", 394 | "\n", 395 | "* Lowercase text.\n", 396 | "* Replace all URLs and numbers with their respective tokens.\n", 397 | "* Strip blankspace.\n", 398 | "* 
Tokenize.\n", 399 | "* Remove punctuation.\n", 400 | "* Remove stop words.\n", 401 | "* Lemmatize the tokens.\n", 402 | "\n", 403 | "Apply this function to `sowing_and_reaping`.\n", 404 | "\n", 405 | "---" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": null, 411 | "id": "f5f3154b-54a3-4458-8779-2859c9d594f8", 412 | "metadata": {}, 413 | "outputs": [], 414 | "source": [ 415 | "def preprocess(text):\n", 416 | " \"\"\"Preprocesses a string.\"\"\"\n", 417 | " # Lowercase\n", 418 | " text = text.lower()\n", 419 | " # Replace URLs\n", 420 | " url_pattern = r'(http|ftp|https):\\/\\/([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-])'\n", 421 | " url_repl = ' URL '\n", 422 | " text = re.sub(url_pattern, url_repl, text)\n", 423 | " # Replace digits\n", 424 | " digit_pattern = '\\d+'\n", 425 | " digit_repl = ' DIGIT '\n", 426 | " text = re.sub(digit_pattern, digit_repl, text)\n", 427 | " # Remove blank spaces\n", 428 | " blankspace_pattern = r'\\s+'\n", 429 | " blankspace_repl = ' '\n", 430 | " text = re.sub(blankspace_pattern, blankspace_repl, text).strip()\n", 431 | " # Tokenize\n", 432 | " tokens = word_tokenize(text)\n", 433 | " # Remove punctuation\n", 434 | " tokens = [token for token in tokens if token not in punctuation]\n", 435 | " # Remove stop words\n", 436 | " stop = stopwords.words('english')\n", 437 | " tokens = [token for token in tokens if token not in stop]\n", 438 | " # Lemmatize\n", 439 | " lemmatizer = WordNetLemmatizer()\n", 440 | " tokens = [lemmatizer.lemmatize(token) for token in tokens]\n", 441 | " return tokens" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "3379311e-768e-4477-9879-c8eb9ad594b7", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "processed = preprocess(sowing_and_reaping)" 452 | ] 453 | } 454 | ], 455 | "metadata": { 456 | "kernelspec": { 457 | "display_name": "Python 3 (ipykernel)", 458 | "language": "python", 459 | "name": "python3" 460 | }, 461 | "language_info": { 462 | "codemirror_mode": { 463 | "name": "ipython", 464 | "version": 3 465 | }, 466 | "file_extension": ".py", 467 | "mimetype": "text/x-python", 468 | "name": "python", 469 | "nbconvert_exporter": "python", 470 | "pygments_lexer": "ipython3", 471 | "version": "3.8.12" 472 | } 473 | }, 474 | "nbformat": 4, 475 | "nbformat_minor": 5 476 | } 477 | -------------------------------------------------------------------------------- /solutions/02_bag_of_words_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a2c37c33-fd3c-4b3a-bbf5-661d29798421", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Fundamentals, Part 2 Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "4f340930-458d-4c63-939f-91ef765b8ef4", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import matplotlib.pyplot as plt\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "import re\n", 22 | "import seaborn as sns\n", 23 | "\n", 24 | "from nltk.tokenize import word_tokenize\n", 25 | "from nltk.corpus import stopwords\n", 26 | "from sklearn.ensemble import RandomForestClassifier\n", 27 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 28 | "from sklearn.linear_model import LogisticRegressionCV\n", 29 | "from sklearn.model_selection import train_test_split\n", 30 | "from string import 
punctuation\n", 31 | "\n", 32 | "%matplotlib inline" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "40b47991-a10b-4f6d-b4e9-188a4644d719", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# pandas (pd) was imported above\n", 43 | "# Use pandas to import tweets\n", 44 | "tweets_path = '../data/airline_tweets.csv'\n", 45 | "tweets = pd.read_csv(tweets_path, sep=',')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "id": "984af902-74f5-4244-a511-08d33bdf6776", 51 | "metadata": {}, 52 | "source": [ 53 | "---\n", 54 | "\n", 55 | "### Challenge 1: Getting to Know the Data\n", 56 | "\n", 57 | "Use `pandas` to find out the following about the airline tweets:\n", 58 | "\n", 59 | "* How many tweets are in the dataset?\n", 60 | "* How many tweets are positive, neutral, and negative?\n", 61 | "* What *proportion* of tweets are positive, neutral, and negative?\n", 62 | "* Make a bar plot showing the proportion of tweet sentiments.\n", 63 | "\n", 64 | "If you have time, try the following:\n", 65 | "\n", 66 | "* How much time separates the earliest and latest tweets?\n", 67 | "* What gets more retweets: positive, negative, or neutral tweets?\n", 68 | "* Identify the airline whose tweets have the highest proportion of negative sentiment.\n", 69 | "\n", 70 | "---" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "id": "6f0a69c0-d541-4776-a4b7-9b5250a11db8", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "# How many tweets are in the dataset?\n", 81 | "tweets.shape[0]" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "id": "acf00946-3058-490e-bf31-74616e459ea4", 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "# How many tweets are positive, neutral, and negative?\n", 92 | "tweets['airline_sentiment'].value_counts()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "235dd7fc-8105-446e-87c4-f8a43425dfe9", 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# What *proportion* of tweets are positive, neutral, and negative?\n", 103 | "tweets['airline_sentiment'].value_counts(normalize=True)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "d2a04bbe-2fd0-4b88-bd72-a804c35575d8", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Make a bar plot showing the proportion of tweet sentiments\n", 114 | "sns.countplot(x=tweets['airline_sentiment'], order=['positive', 'neutral', 'negative'])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "id": "b8f6c6c0-e3be-4507-8e05-a04c71d8a886", 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# How much time separates the earliest and latest tweets?\n", 125 | "sorted_by_time = pd.to_datetime(tweets['tweet_created'].sort_values())\n", 126 | "sorted_by_time.iloc[-1] - sorted_by_time.iloc[0]" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "id": "28f04d01-544e-4ffd-a3ae-1de68df7cb44", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# What gets more retweets: positive, negative, or neutral tweets?\n", 137 | "tweets.groupby('airline_sentiment')['retweet_count'].mean()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "id": "cfb5c45a-8fad-4515-b43c-629b52ab212c", 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "# Which airline receives the 
highest proportion of negative tweets?\n", 148 | "proportions = tweets.groupby(['airline', 'airline_sentiment']).size() / tweets.groupby('airline').size()\n", 149 | "proportions.unstack().sort_values('negative')" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "id": "df76ff67-d285-4148-a642-f8bd4ce0ca4f", 155 | "metadata": {}, 156 | "source": [ 157 | "---\n", 158 | "\n", 159 | "### Challenge 2: Creating a Preprocessing Pipeline for Social Media Data\n", 160 | "\n", 161 | "Write a function called `preprocess()` that performs the following on a text input:\n", 162 | "\n", 163 | "* Lowercase text.\n", 164 | "* Replace all URLs with the token \"URL\".\n", 165 | "* Replace all numbers with the token \"DIGIT\".\n", 166 | "* Replace hashtags with the token \"HASHTAG\".\n", 167 | "* Replace all users with the token \"USER\".\n", 168 | "* Remove blankspaces.\n", 169 | "\n", 170 | "We have provided regex patterns for each of the replacement steps in the following cells.\n", 171 | "\n", 172 | "Run your `preprocess()` function on `example_tweet` (two cells below), and when you think you have it working, apply it to the entire `text` column in the tweets DataFrame.\n", 173 | "\n", 174 | "---" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "id": "8052135f-bbbb-4c4f-b090-ab08ec3d0ad1", 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "# Apply your function to the following example\n", 185 | "example_tweet = \"lol @justinbeiber and @BillGates are like soo 2000 #yesterday #amiright saw it on https://twitter.com #yolo\"" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "id": "aa8ee2ff-94de-4352-afbb-b4ddfe44d35c", 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "def preprocess(text):\n", 196 | " \"\"\"Preprocesses a string.\"\"\"\n", 197 | " # Lowercase\n", 198 | " text = text.lower()\n", 199 | " # Replace URLs\n", 200 | " url_pattern = r'(http|ftp|https):\\/\\/([\\w_-]+(?:(?:\\.[\\w_-]+)+))([\\w.,@?^=%&:\\/~+#-]*[\\w@?^=%&\\/~+#-])'\n", 201 | " url_repl = ' URL '\n", 202 | " text = re.sub(url_pattern, url_repl, text)\n", 203 | " # Replace digits\n", 204 | " digit_pattern = '\\d+'\n", 205 | " digit_repl = ' DIGIT '\n", 206 | " text = re.sub(digit_pattern, digit_repl, text)\n", 207 | " # Replace hashtags\n", 208 | " hashtag_pattern = r'(?:^|\\s)[##]{1}(\\w+)'\n", 209 | " hashtag_repl = ' HASHTAG '\n", 210 | " text = re.sub(hashtag_pattern, hashtag_repl, text)\n", 211 | " # Replace users\n", 212 | " user_pattern = r'@(\\w+)'\n", 213 | " user_repl = ' USER '\n", 214 | " text = re.sub(user_pattern, user_repl, text)\n", 215 | " # Remove blank spaces\n", 216 | " blankspace_pattern = r'\\s+'\n", 217 | " blankspace_repl = ' '\n", 218 | " text = re.sub(blankspace_pattern, blankspace_repl, text).strip()\n", 219 | " return text" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "id": "4e301605-cdb6-4c0e-9dfd-6f736400be33", 226 | "metadata": {}, 227 | "outputs": [], 228 | "source": [ 229 | "# Test on example tweet\n", 230 | "preprocess(example_tweet)" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "ee42e6b0-6a04-4b40-bf4e-4c2856eacf78", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Apply to text column to create a new column\n", 241 | "tweets['text_processed'] = tweets['text'].apply(lambda x: preprocess(x))\n", 242 | "tweets['text_processed'].head()" 243 | ] 244 | }, 245 | { 246 
| "cell_type": "markdown", 247 | "id": "8e25a7b6-b726-46e7-96c1-a022eca35876", 248 | "metadata": {}, 249 | "source": [ 250 | "---\n", 251 | "\n", 252 | "### Challenge 3: DTM Data Analysis\n", 253 | "\n", 254 | "* Print out the most infrequent words rather than the most frequent words. If you're not sure how, check the [documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html)!\n", 255 | "* Print the average number of times each word is used in a tweet.\n", 256 | "* Which non-hashtag, non-digit token appears the most in any given tweet? How many times does it appear? What is the original tweet?\n", 257 | "\n", 258 | "---" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "id": "f0d7df60-776a-4c32-a701-9bef6b352486", 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "vectorizer = CountVectorizer()\n", 269 | "counts = vectorizer.fit_transform(tweets['text_processed'])\n", 270 | "# Extract tokens\n", 271 | "tokens = vectorizer.get_feature_names_out()\n", 272 | "# Create DTM\n", 273 | "dtm = pd.DataFrame(data=counts.todense(),\n", 274 | " index=tweets.index,\n", 275 | " columns=tokens)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "d5356fc1-da93-477f-8982-47a2a82f3ac4", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "# Most infrequent tokens\n", 286 | "dtm.sum().sort_values(ascending=True).head(20)" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "id": "58c89580-601a-49b5-8037-dd1723d1ddf6", 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "# Average number of times each word is used in a tweet\n", 297 | "dtm.mean().sort_values(ascending=False).head(20)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": null, 303 | "id": "f088720b-98d7-4d0e-8e62-5b91a098428f", 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "# Which token appears the most in any given tweet?\n", 308 | "counts = pd.DataFrame()\n", 309 | "counts['token'] = dtm.idxmax(axis=1)\n", 310 | "counts['number'] = dtm.max(axis=1)\n", 311 | "counts[(counts['token'] != 'digit')\n", 312 | " & (counts['token'] != 'hashtag')\n", 313 | " & (counts['token'] != 'user')].sort_values(\n", 314 | " 'number',\n", 315 | " ascending=False).head(10)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "6eb4a1da-8e1a-49df-85bb-aed1a97106a8", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "# Look at index 1214: \"worst\"\n", 326 | "tweets.iloc[1214]['text']" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "id": "53135016-92a6-4b6a-836f-0241234c3427", 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# Look at index 3915: \"lt\"\n", 337 | "tweets.iloc[3915]['text']" 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "id": "a3840105-d53b-40f4-9a7e-90134b6b86d4", 343 | "metadata": {}, 344 | "source": [ 345 | "---\n", 346 | "\n", 347 | "### Challenge 4: Customizing the Vectorizer with `nltk` inputs\n", 348 | "\n", 349 | "If you look at the `CountVectorizer` documentation, you'll see that it can actually accept a custom `tokenizer` and `stop_words` list. \n", 350 | "\n", 351 | "Using what you learned in the previous workshop, create a `CountVectorizer` that utilizes the `nltk` word tokenizer and stop word list. 
How does the resulting DTM look different?\n", 352 | "\n", 353 | "---" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "id": "f3123645-1582-437d-92b2-db3233442efb", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "# Get stop words\n", 364 | "stop_words = stopwords.words('english')\n", 365 | "# Create the vectorizer\n", 366 | "vectorizer = CountVectorizer(\n", 367 | " lowercase=True,\n", 368 | " tokenizer=word_tokenize,\n", 369 | " stop_words=stop_words,\n", 370 | " min_df=2,\n", 371 | " max_df=0.95)\n", 372 | "# Fit, transform, and get tokens\n", 373 | "counts = vectorizer.fit_transform(tweets['text_processed'])\n", 374 | "tokens = vectorizer.get_feature_names_out()\n", 375 | "# Create dataframe\n", 376 | "dtm = pd.DataFrame(data=counts.todense(),\n", 377 | " index=tweets.index,\n", 378 | " columns=tokens)\n", 379 | "print(dtm.shape)\n", 380 | "dtm.head()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "id": "53695e2f-c113-4fb0-9ef0-a99523e5c132", 386 | "metadata": {}, 387 | "source": [ 388 | "---\n", 389 | "\n", 390 | "### Challenge 5\n", 391 | "\n", 392 | "Try developing a **multinomial logistic regression** model, to predict positive, negative, and neutral labels. We've provided you a fitter function below, but it's up to you to create new labels, train-test splits, and perform the fitting and evaluation!\n", 393 | "\n", 394 | "---" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "id": "f3c26312-5e13-4156-8af3-56107da98d3f", 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "def fit_multinomial_logistic_regression(X, y):\n", 405 | " \"\"\"Fits a logistic regression model to provided data.\"\"\"\n", 406 | " model = LogisticRegressionCV(\n", 407 | " multi_class='multinomial',\n", 408 | " Cs=10,\n", 409 | " penalty='l1',\n", 410 | " solver='saga',\n", 411 | " tol=1e-2,\n", 412 | " max_iter=50,\n", 413 | " cv=3,\n", 414 | " refit=True).fit(X, y)\n", 415 | " return model" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "b00245f4-1831-4e0e-96d6-2abf08ef0d0d", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "vectorizer = TfidfVectorizer(max_features=5000)\n", 426 | "dtm = vectorizer.fit_transform(tweets['text_processed'])\n", 427 | "X = np.asarray(dtm.todense())\n", 428 | "y = tweets['airline_sentiment']\n", 429 | "print(X.shape)" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "0a045313-e68d-4be2-8360-3f9af28cccc4", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "id": "843b8b4a-8b77-4ac6-843d-45caeecad2bf", 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "# This may take a while to run!\n", 450 | "model = fit_multinomial_logistic_regression(X_train, y_train)" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "id": "dce8f92c-4f2a-4c0d-ba4f-8ab50f9de90e", 457 | "metadata": {}, 458 | "outputs": [], 459 | "source": [ 460 | "print(f\"Training accuracy: {model.score(X_train, y_train)}\")\n", 461 | "print(f\"Test accuracy: {model.score(X_test, y_test)}\")" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "id": "d3b19e4f-fba6-46e3-8280-461582ca8624", 467 | "metadata": {}, 468 | "source": [ 469 | "---\n", 470 
| "\n", 471 | "### Challenge 6\n", 472 | "\n", 473 | "Create a new fitter function that uses a `RandomForestClassifier`. How is the performance? Check the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) for more details.\n", 474 | "\n", 475 | "---" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": null, 481 | "id": "8303ec28-d6a2-4700-8ee8-30a9195485f9", 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "def fit_random_forest(X, y):\n", 486 | " \"\"\"Fits a random forest model to provided data.\"\"\"\n", 487 | " model = RandomForestClassifier(n_estimators=50).fit(X, y)\n", 488 | " return model" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "id": "e55f53b0-fb95-4505-8fac-a0fa38630382", 495 | "metadata": {}, 496 | "outputs": [], 497 | "source": [ 498 | "tweets_binary = tweets[tweets['airline_sentiment'] != 'neutral']\n", 499 | "vectorizer = TfidfVectorizer(max_features=5000)\n", 500 | "dtm = vectorizer.fit_transform(tweets_binary['text_processed'])\n", 501 | "X = np.asarray(dtm.todense())\n", 502 | "y = tweets_binary['airline_sentiment']\n", 503 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": null, 509 | "id": "d3f81bc2-944f-47fa-adef-9e3ef840670a", 510 | "metadata": {}, 511 | "outputs": [], 512 | "source": [ 513 | "model = fit_random_forest(X_train, y_train)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "id": "02295745-0611-40c4-863a-8ef0c9d58882", 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [ 523 | "# Overfitting a bit!\n", 524 | "print(f\"Training accuracy: {model.score(X_train, y_train)}\")\n", 525 | "print(f\"Test accuracy: {model.score(X_test, y_test)}\")" 526 | ] 527 | } 528 | ], 529 | "metadata": { 530 | "kernelspec": { 531 | "display_name": "Python 3 (ipykernel)", 532 | "language": "python", 533 | "name": "python3" 534 | }, 535 | "language_info": { 536 | "codemirror_mode": { 537 | "name": "ipython", 538 | "version": 3 539 | }, 540 | "file_extension": ".py", 541 | "mimetype": "text/x-python", 542 | "name": "python", 543 | "nbconvert_exporter": "python", 544 | "pygments_lexer": "ipython3", 545 | "version": "3.8.12" 546 | } 547 | }, 548 | "nbformat": 4, 549 | "nbformat_minor": 5 550 | } 551 | -------------------------------------------------------------------------------- /solutions/03_word_embeddings_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "37f37149-c4d4-4bb2-be40-33caccccdc92", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Word Embeddings Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "428bef9b-2ce9-456c-a260-9b72d79d6e85", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import gensim\n", 20 | "import gensim.downloader as api\n", 21 | "import pandas as pd\n", 22 | "import re\n", 23 | "\n", 24 | "from gensim.models import Word2Vec\n", 25 | "from nltk.tokenize import word_tokenize\n", 26 | "from sklearn.model_selection import train_test_split\n", 27 | "from sklearn.linear_model import LogisticRegressionCV" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "id": "99c923bb-7286-4d55-8002-684343a5a9cf", 34 | 
"metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "wv = api.load('word2vec-google-news-300')" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "9cf4a748-d43e-4c76-8eaa-ad504e5c3a51", 43 | "metadata": {}, 44 | "source": [ 45 | "## Challenge 1\n", 46 | "\n", 47 | "Look up the `doesnt_match` function in `gensim`'s documentation. Use this function to identify which word doesn't match in the following group:\n", 48 | "\n", 49 | "banana, apple, strawberry, happy\n", 50 | "\n", 51 | "Then, try it on groups of words that you choose. Here are some suggestions:\n", 52 | "\n", 53 | "1. A group of fruits, and a vegetable. Can it identify that the vegetable doesn't match?\n", 54 | "2. A group of vehicles that travel by land, and a vehicle that travels by air (e.g., a plane or helicopter). Can it identify the vehicle that flies?\n", 55 | "3. A group of scientists (e.g., biologist, physicist, chemist, etc.) and a person who does not study an empirical science (e.g., an artist). Can it identify the occupation that is not science based?\n", 56 | "\n", 57 | "To be clear, `word2vec` does not learn the precise nature of the differences between these groups. However, the semantic differences correspond to similar words appearing near each other in large corpora." 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "63c4f076-005a-4176-93dd-60e681f6b805", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "wv.doesnt_match(['banana', 'apple', 'strawberry', 'happy'])" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "id": "bfa15ad5-b3fe-4871-8c07-bfa1bac58e11", 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "wv.doesnt_match(['banana', 'apple', 'strawberry', 'carrot'])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "id": "cfe8838e-c48f-49f0-9a0c-f135e1fb7d19", 84 | "metadata": {}, 85 | "outputs": [], 86 | "source": [ 87 | "wv.doesnt_match(['car', 'bike', 'bus', 'plane'])" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "id": "bbc36a9d-08c2-47ca-8564-0c3d76fde4b4", 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "wv.doesnt_match(['biologist', 'physicist', 'chemist', 'artist'])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "id": "4fcbee67-bfce-41bb-8640-e0bbdc742519", 103 | "metadata": {}, 104 | "source": [ 105 | "## Challenge 2\n", 106 | "\n", 107 | "Carry out the following word analogies:\n", 108 | "\n", 109 | "1. Mouse : Mice :: Goose : ?\n", 110 | "2. Kangaroo : Joey :: Cat : ?\n", 111 | "3. United States : Dollar :: Mexico : ?\n", 112 | "4. Happy : Sad :: Up : ?\n", 113 | "5. California : Sacramento :: Canada : ?\n", 114 | "6. California : Sacramento :: Washington : ?\n", 115 | "\n", 116 | "What about something more abstract, such as:\n", 117 | "\n", 118 | "7. United States : hamburger :: Canada : ?\n", 119 | "\n", 120 | "Some work well, and others don't work as well. Try to come up with your own analogies!" 
121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "id": "1e9c2318-2f3a-4a4f-937c-3968803656a3", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "wv.most_similar(positive=['mice', 'goose'], negative=['mouse'])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "e77d3aec-a703-46bc-98fd-4cccf3f3f454", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "wv.most_similar(positive=['joey', 'cat'], negative=['kangaroo'])" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "id": "1fcd1a2a-4034-430c-82fe-7068ee335590", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "wv.most_similar(positive=['Dollar', 'Mexico'], negative=['United_States'])" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "id": "6e601ca0-3ec0-4458-a1bf-12dc8c21a1fc", 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "wv.most_similar(positive=['sad', 'up'], negative=['happy'])" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "id": "d887cadb-d4bc-4fc1-8a4f-f7b2bce74429", 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "wv.most_similar(positive=['Sacramento', 'Canada'], negative=['California'])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": null, 176 | "id": "de077690-f6db-4590-9e44-afd789ec67ee", 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "wv.most_similar(positive=['Sacramento', 'Washington'], negative=['California'])" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "7a45aded-bf29-4f1b-8265-30cadd113bfb", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "wv.most_similar(positive=['hamburger', 'Canada'], negative=['United_States'])" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "id": "11a2a737-4617-4288-b09f-85a5d1139697", 196 | "metadata": {}, 197 | "source": [ 198 | "## Challenge 3\n", 199 | "\n", 200 | "Try experimenting with different numbers of vector sizes, window sizes, and other parameters available in the `Word2Vec` module. Additionally, try training using skip-grams rather than CBOW." 
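,
"\n",
"For reference, all of these knobs are keyword arguments of `Word2Vec`: `vector_size`, `window`, `min_count`, and `sg` (`sg=0`, the default, trains CBOW; `sg=1` trains skip-grams). A minimal comparison-loop sketch, assuming `sentences` is the tokenized tweet list built in the solution cells below (the particular sizes are arbitrary choices):\n",
"\n",
"```python\n",
"from gensim.models import Word2Vec\n",
"\n",
"# Compare CBOW (sg=0) and skip-gram (sg=1) across two vector sizes\n",
"for sg in (0, 1):\n",
"    for vector_size in (50, 100):\n",
"        model = Word2Vec(\n",
"            sentences=sentences,\n",
"            vector_size=vector_size,\n",
"            window=5,\n",
"            min_count=2,\n",
"            sg=sg)\n",
"        print(sg, vector_size, model.wv.most_similar('worst', topn=3))\n",
"```"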
201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "id": "70d6c037-133b-4979-9aaa-830e11e29b6d", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "tweets_path = '../data/airline_tweets.csv'\n", 211 | "tweets = pd.read_csv(tweets_path, sep=',')" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "id": "f6c63a20-da4f-4e3f-8ec2-2ee89a736d70", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "def preprocess(text):\n", 222 | "    \"\"\"Preprocesses a string.\"\"\"\n", 223 | "    # Lowercase\n", 224 | "    text = text.lower()\n", 225 | "    # Replace URLs\n", 226 | "    url_pattern = r'https?:\\/\\/.*[\\r\\n]*'\n", 227 | "    url_repl = ' URL '\n", 228 | "    text = re.sub(url_pattern, url_repl, text)\n", 229 | "    # Replace digits\n", 230 | "    digit_pattern = r'\d+'\n", 231 | "    digit_repl = ' DIGIT '\n", 232 | "    text = re.sub(digit_pattern, digit_repl, text)\n", 233 | "    # Replace hashtags\n", 234 | "    hashtag_pattern = r'(?:^|\\s)[#＃]{1}(\\w+)'\n", 235 | "    hashtag_repl = ' HASHTAG '\n", 236 | "    text = re.sub(hashtag_pattern, hashtag_repl, text)\n", 237 | "    # Replace users\n", 238 | "    user_pattern = r'@(\\w+)'\n", 239 | "    user_repl = ' USER '\n", 240 | "    text = re.sub(user_pattern, user_repl, text)\n", 241 | "    # Remove blank spaces\n", 242 | "    blankspace_pattern = r'\\s+'\n", 243 | "    blankspace_repl = ' '\n", 244 | "    text = re.sub(blankspace_pattern, blankspace_repl, text).strip()\n", 245 | "    return text" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "d5150121-08fe-4e50-8fa4-19fcbe96639c", 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "tweets['text_processed'] = tweets['text'].apply(lambda x: preprocess(x))\n", 256 | "tweets['text_processed'].head()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "id": "e001db47-d08d-471d-a841-ffa849462a82", 263 | "metadata": {}, 264 | "outputs": [], 265 | "source": [ 266 | "sentences = [word_tokenize(tweet) for tweet in tweets['text_processed']]" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "id": "b2ae4512-cdd6-41e4-b798-ce538ded8010", 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "model = Word2Vec(\n", 277 | "    sentences=sentences,\n", 278 | "    vector_size=50,\n", 279 | "    window=5,\n", 280 | "    min_count=2,\n", 281 | "    sg=1)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "id": "4129c620-8120-4634-87dd-790820de6928", 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "model.wv.most_similar('worst')" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "id": "3a5d01f9-2076-4820-9cf0-a578925fd9a0", 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "model.wv.most_similar('great')" 302 | ] 303 | }, 304 | { 305 | "cell_type": "markdown", 306 | "id": "72fdf27f-0212-4d76-a015-fc9bd29c1192", 307 | "metadata": {}, 308 | "source": [ 309 | "## Challenge 4\n", 310 | "\n", 311 | "Write a function that performs the pipeline of building a `word2vec` model and constructing a design matrix. Use this function to try and see if you can change the performance of the model with other parameters (vector sizes, window sizes, etc.)."
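,
"\n",
"(The solution below represents each tweet $d$ by the average of its word vectors, $x_d = \\frac{1}{|d|} \\sum_{w \\in d} v_w$, where $v_w$ is the embedding of token $w$ and $|d|$ is the number of tokens in the tweet; the design matrix stacks these averaged vectors row by row.)"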
312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "id": "25c2058d-acfd-475e-a6e0-6d1555e773e3", 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "tweets_binary = tweets[tweets['airline_sentiment'] != 'neutral']\n", 322 | "y = tweets_binary['airline_sentiment']" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "id": "44f6648b-4eaf-4a6f-a023-ff5b25bd880f", 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "def featurizer(documents, to_train, vector_size=50, window=6, sg=0):\n", 333 | " \"\"\"Computes a feature matrix from a document corpus.\"\"\"\n", 334 | " sentences = [word_tokenize(doc) for doc in documents]\n", 335 | " # Train word2vec\n", 336 | " model = Word2Vec(\n", 337 | " sentences=sentences,\n", 338 | " vector_size=vector_size,\n", 339 | " window=window,\n", 340 | " min_count=1,\n", 341 | " sg=sg)\n", 342 | " \n", 343 | " X = np.zeros((len(to_train), vector_size))\n", 344 | " # Enumerate over tweets\n", 345 | " for idx, doc in enumerate(to_train):\n", 346 | " # Tokenize the current tweet\n", 347 | " tokens = word_tokenize(doc)\n", 348 | " n_tokens = len(tokens)\n", 349 | " # Enumerate over tokens, obtaining word vectors\n", 350 | " for token in tokens:\n", 351 | " X[idx] += model.wv.get_vector(token)\n", 352 | " # Take the average\n", 353 | " X[idx] /= n_tokens\n", 354 | " return X" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "27785ffa-0fce-4b8c-8e36-0fc1573810ec", 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "X = featurizer(tweets['text_processed'], tweets_binary['text_processed'], vector_size=80, window=6)" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "71a3da82-91b0-45a9-a7fb-76832ed1315c", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "4f52dc37-f051-45bc-9b09-bc61d46960da", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "def fit_logistic_regression(X, y):\n", 385 | " \"\"\"Fits a logistic regression model to provided data.\"\"\"\n", 386 | " model = LogisticRegressionCV(\n", 387 | " Cs=5,\n", 388 | " penalty='l2',\n", 389 | " max_iter=1000,\n", 390 | " tol=1e-2,\n", 391 | " cv=3,\n", 392 | " refit=True).fit(X, y)\n", 393 | " return model" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": null, 399 | "id": "a7e148d1-952e-4ff4-84ac-5f020ecfe29b", 400 | "metadata": {}, 401 | "outputs": [], 402 | "source": [ 403 | "# Fit the logistic regression model\n", 404 | "fitter = fit_logistic_regression(X_train, y_train)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "a67ce823-5899-4336-958a-fa7cabcc40c3", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "print(f\"Training accuracy: {fitter.score(X_train, y_train)}\")\n", 415 | "print(f\"Test accuracy: {fitter.score(X_test, y_test)}\")" 416 | ] 417 | } 418 | ], 419 | "metadata": { 420 | "kernelspec": { 421 | "display_name": "Python 3 (ipykernel)", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | 
"nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.8.12" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 5 440 | } 441 | -------------------------------------------------------------------------------- /solutions/04_topic_modeling_solutions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "dd103089-a544-4396-9978-e879fc022d8d", 6 | "metadata": {}, 7 | "source": [ 8 | "# Python Text Analysis: Topic Modeling Solutions" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "98db9841-6916-48d0-8d43-487405b332a0", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import matplotlib.pyplot as plt\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "\n", 22 | "from sklearn.decomposition import LatentDirichletAllocation, NMF\n", 23 | "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", 24 | "from sklearn.metrics.pairwise import cosine_similarity\n", 25 | "\n", 26 | "%matplotlib inline" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "id": "61d87880-ec31-47d3-8bb4-cf2af3912f2f", 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "from sklearn.decomposition import NMF" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "id": "863f4a96-edc4-41bb-b9ce-44a847c709a9", 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# Import fetcher function\n", 47 | "from sklearn.datasets import fetch_20newsgroups" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "952e2bc0-a009-476e-9658-07a83508d96e", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "full_data, labels = fetch_20newsgroups(\n", 58 | " subset='train',\n", 59 | " shuffle=True,\n", 60 | " random_state=1,\n", 61 | " remove=(\"headers\", \"footers\", \"quotes\"),\n", 62 | " return_X_y=True)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "id": "1723d1f5-1c98-41c3-a28c-c571b7d8ccb0", 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "n_subsamples = 2000\n", 73 | "data = full_data[:n_subsamples]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "3206bfd7-ca4a-4a9a-a552-96299eab2664", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "def plot_top_words(model, feature_names, n_top_words=10, n_row=2, n_col=5, normalize=False):\n", 84 | " \"\"\"Plots the top words from a topic model.\n", 85 | " \n", 86 | " Parameters\n", 87 | " ----------\n", 88 | " model : topic model object (e.g., LatentDirichletAllocation, NMF)\n", 89 | " The trained topic model. 
It should have a components_ attribute.\n", 90 | " feature_names : array-like of strings\n", 91 | " The names of each token, as a list or array.\n", 92 | " n_top_words : int\n", 93 | " The number of words to plot for each topic.\n", 94 | " n_row : int\n", 95 | " The number of rows in the plot.\n", 96 | " n_col : int\n", 97 | " The number of columns in the plot.\n", 98 | " normalize : boolean\n", 99 | " If True, normalizes the components so that they sum to 1 along samples.\n", 100 | " \"\"\"\n", 101 | " # Create figure\n", 102 | " fig, axes = plt.subplots(n_row, n_col, figsize=(3 * n_col, 5 * n_row), sharex=True)\n", 103 | " axes = axes.flatten()\n", 104 | " components = model.components_\n", 105 | " # Normalize components, if necessary\n", 106 | " if normalize:\n", 107 | " components = components / components.sum(axis=1)[:, np.newaxis]\n", 108 | " # Iterate over each topic\n", 109 | " for topic_idx, topic in enumerate(components):\n", 110 | " # Obtain the top words for each topic\n", 111 | " top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]\n", 112 | " # Get the token names\n", 113 | " top_features = [feature_names[i] for i in top_features_ind]\n", 114 | " # Get their values\n", 115 | " weights = topic[top_features_ind]\n", 116 | "\n", 117 | " # Plot the token weights as a bar plot\n", 118 | " ax = axes[topic_idx]\n", 119 | " ax.barh(top_features, weights, height=0.7)\n", 120 | " ax.set_title(f\"Topic {topic_idx +1}\", fontdict={\"fontsize\": 20})\n", 121 | " ax.invert_yaxis()\n", 122 | " ax.tick_params(axis=\"both\", which=\"major\", labelsize=20)\n", 123 | " \n", 124 | " # Customize plot\n", 125 | " for i in \"top right left\".split():\n", 126 | " ax.spines[i].set_visible(False)\n", 127 | "\n", 128 | " plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)\n", 129 | "\n", 130 | " return fig, axes" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "id": "7a0c5325-4de3-4a67-890c-aecee563fbd1", 137 | "metadata": {}, 138 | "outputs": [], 139 | "source": [ 140 | "n_tokens = 1000\n", 141 | "\n", 142 | "vectorizer = TfidfVectorizer(\n", 143 | " max_df=0.95,\n", 144 | " min_df=2,\n", 145 | " max_features=n_tokens,\n", 146 | " stop_words=\"english\")\n", 147 | "\n", 148 | "# Perform vectorizing\n", 149 | "tfidf = vectorizer.fit_transform(data)\n", 150 | "tokens = vectorizer.get_feature_names_out()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "id": "05a34af7-b163-4a79-bbc6-bda47f2e9962", 156 | "metadata": {}, 157 | "source": [ 158 | "---\n", 159 | "\n", 160 | "### Challenge 1: Exploring Hyperparameters in NMF\n", 161 | "\n", 162 | "The choice of 10 components was somewhat arbitrary. It was something we had to do before we could proceed with fitting the model to the data. This is what's known as a *hyperparameter*. There are other hyperparameters in the `NMF`. For example, the `alpha` values specifies to what degree we should force values to be set equal to zero.\n", 163 | "\n", 164 | "Try fitting the NMF with other variations of hyperparameters, and plot the resulting topics using the `plot_top_words` function. 
What do you notice?\n", 165 | "\n", 166 | "---" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "id": "e87829cf-bd9d-457f-b276-e6d2fae34633", 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "# Make alpha very large\n", 177 | "n_components = 10\n", 178 | "random_state = 1\n", 179 | "\n", 180 | "nmf = NMF(\n", 181 | " n_components=n_components,\n", 182 | " random_state=random_state,\n", 183 | " alpha=100,\n", 184 | " l1_ratio=0.5,\n", 185 | " init='nndsvda',\n", 186 | " max_iter=500).fit(tfidf)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "87955998-99b3-412b-acd5-baba58e1c8c6", 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# Notice how everything is now zero\n", 197 | "fig, axes = plot_top_words(nmf, tokens)\n", 198 | "plt.show()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "950b585d-0f27-401d-9e41-c3a0e085bb6d", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "# Increase number of topics\n", 209 | "n_components = 20\n", 210 | "random_state = 1\n", 211 | "\n", 212 | "nmf = NMF(\n", 213 | " n_components=n_components,\n", 214 | " random_state=random_state,\n", 215 | " alpha=0.1,\n", 216 | " l1_ratio=0.5,\n", 217 | " init='nndsvda',\n", 218 | " max_iter=500).fit(tfidf)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "id": "ae70ebac-fefd-4303-b8e2-c90d07d936a4", 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "# We need to change number of rows\n", 229 | "fig, axes = plot_top_words(nmf, tokens, n_row=4)\n", 230 | "plt.show()" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": null, 236 | "id": "8afd7135-243b-497f-a662-da08a43b6be8", 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "# Decrease number of topics\n", 241 | "n_components = 5\n", 242 | "random_state = 1\n", 243 | "\n", 244 | "nmf = NMF(\n", 245 | " n_components=n_components,\n", 246 | " random_state=random_state,\n", 247 | " alpha=0.1,\n", 248 | " l1_ratio=0.5,\n", 249 | " init='nndsvda',\n", 250 | " max_iter=500).fit(tfidf)" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": null, 256 | "id": "c0ba121b-d124-4fd9-a863-a32649f3b970", 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "# We need to change number of rows\n", 261 | "fig, axes = plot_top_words(nmf, tokens, n_row=1)\n", 262 | "plt.show()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "id": "369f49ed-d1a2-4f84-9dc6-171481d1faf7", 268 | "metadata": {}, 269 | "source": [ 270 | "---\n", 271 | "\n", 272 | "### Challenge 2: Exploring Hyperparameters in LDA\n", 273 | "\n", 274 | "As in the case of NMF, try performing LDA with other variations of hyperparameters, and plot the resulting topics using the `plot_top_words` function. 
Use the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html) as a guide to choose different hyperparameters.\n", 275 | "\n", 276 | "---" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": null, 282 | "id": "27bc21ac-f5f7-4a90-8a33-b387a6703d29", 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "# Use a CountVectorizer for LDA\n", 287 | "n_tokens = 1000\n", 288 | "count_vectorizer = CountVectorizer(\n", 289 | " max_df=0.95,\n", 290 | " min_df=2,\n", 291 | " max_features=n_tokens,\n", 292 | " stop_words=\"english\")\n", 293 | "counts = count_vectorizer.fit_transform(data)\n", 294 | "tokens = count_vectorizer.get_feature_names_out()" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "bc06683e-0cc6-4f9e-87be-c12d1f10fb24", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "# Increase number of components\n", 305 | "n_components = 20\n", 306 | "random_state = 0\n", 307 | "\n", 308 | "lda = LatentDirichletAllocation(\n", 309 | " n_components=n_components,\n", 310 | " max_iter=5,\n", 311 | " learning_method=\"online\",\n", 312 | " learning_offset=50.0, \n", 313 | " random_state=random_state).fit(counts)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": null, 319 | "id": "4ab59bec-7f6d-4f8a-a28e-abe753a3e46b", 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "# Change number of rows\n", 324 | "fig, axes = plot_top_words(lda, tokens, normalize=True, n_row=4)\n", 325 | "plt.show()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": null, 331 | "id": "c6ea3f37-0fb5-4d79-9991-d4c4d51977d8", 332 | "metadata": {}, 333 | "outputs": [], 334 | "source": [ 335 | "# Decrease number of components\n", 336 | "n_components = 5\n", 337 | "random_state = 0\n", 338 | "\n", 339 | "lda = LatentDirichletAllocation(\n", 340 | " n_components=n_components,\n", 341 | " max_iter=5,\n", 342 | " learning_method=\"online\",\n", 343 | " learning_offset=50.0, \n", 344 | " random_state=random_state).fit(counts)" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "d6ea7441-f725-4e34-8c0f-8aefce3cc3bd", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "# Change number of rows\n", 355 | "fig, axes = plot_top_words(lda, tokens, normalize=True, n_row=1)\n", 356 | "plt.show()" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "id": "8f15cf1d-e256-4633-b93a-62f99ed94cb3", 362 | "metadata": {}, 363 | "source": [ 364 | "---\n", 365 | "\n", 366 | "### Challenge 3: Finding Similar Documents\n", 367 | "\n", 368 | "Calculate the cosine similarity between all pairs of documents, and find the two documents whose cosine similarity is the highest. What are these documents? 
Do they seem similar?\n", 369 | "\n", 370 | "---" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "id": "e2438ec8-cf86-4436-9d15-859bf0953cbe", 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "n_components = 10\n", 381 | "random_state = 0\n", 382 | "\n", 383 | "lda = LatentDirichletAllocation(\n", 384 | " n_components=n_components,\n", 385 | " max_iter=5,\n", 386 | " learning_method=\"online\", # Use when dataset is large\n", 387 | " learning_offset=50.0, \n", 388 | " random_state=random_state).fit(counts)" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "cbf0bf8b-7c9b-4761-9156-91da3a4d3319", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "topic_representation = lda.transform(counts)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": null, 404 | "id": "e0e15a71-1225-448e-93af-ed60db6392cf", 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "# Calculate similarities\n", 409 | "similarities = cosine_similarity(topic_representation)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "61161d45-3b12-4a47-b59b-d9bc43323a23", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "# Double check the shape\n", 420 | "similarities.shape" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": null, 426 | "id": "e4390c54-5105-4984-8b34-cd463cadac5a", 427 | "metadata": {}, 428 | "outputs": [], 429 | "source": [ 430 | "# The diagonal of this matrix is all ones.\n", 431 | "# We want to zero this out in order to find the highest similarities.\n", 432 | "np.fill_diagonal(similarities, 0)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": null, 438 | "id": "d2cdcdbc-8813-43bd-93f4-4ee0969adedd", 439 | "metadata": {}, 440 | "outputs": [], 441 | "source": [ 442 | "# Now, find the highest value\n", 443 | "# We need two functions: np.argmax, and np.unravel_index\n", 444 | "idx1, idx2 = np.unravel_index(np.argmax(similarities), similarities.shape)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "id": "202ce399-0a54-49fd-b597-779eb2af236d", 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# What is the similarity?\n", 455 | "similarities[idx1, idx2]" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "id": "e31c1e48-8d3a-4931-b0c2-496c71f00036", 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "# What are the documents?\n", 466 | "print(data[idx1])\n", 467 | "print(data[idx2])" 468 | ] 469 | } 470 | ], 471 | "metadata": { 472 | "kernelspec": { 473 | "display_name": "Python 3 (ipykernel)", 474 | "language": "python", 475 | "name": "python3" 476 | }, 477 | "language_info": { 478 | "codemirror_mode": { 479 | "name": "ipython", 480 | "version": 3 481 | }, 482 | "file_extension": ".py", 483 | "mimetype": "text/x-python", 484 | "name": "python", 485 | "nbconvert_exporter": "python", 486 | "pygments_lexer": "ipython3", 487 | "version": "3.8.12" 488 | } 489 | }, 490 | "nbformat": 4, 491 | "nbformat_minor": 5 492 | } 493 | --------------------------------------------------------------------------------