├── 2020 ├── README_2020.md ├── data │ ├── data.md │ ├── download_coca.sh │ ├── download_hum19uk.sh │ ├── numerical │ │ └── pandas_demo.csv │ ├── tokenize │ │ ├── challenge.txt │ │ └── simple.txt │ ├── wikipedia │ │ ├── cologne.txt │ │ ├── linguistics.txt │ │ └── python.txt │ └── xml │ │ ├── bnc_style.xml │ │ └── xpath_example.xml ├── exercises │ ├── Exercises 1-3.docx │ ├── Exercises 1-3.pdf │ ├── Exercises 4-5.docx │ ├── Exercises 4-5.pdf │ ├── Exercises 6-7.docx │ ├── Exercises 6-7.pdf │ ├── Exercises 8-16.docx │ ├── Exercises 8-16.pdf │ ├── Playground.ipynb │ ├── Solutions_Exercises_1_3.ipynb │ ├── Solutions_Exercises_4_5.ipynb │ ├── Solutions_Exercises_6_7.ipynb │ ├── Solutions_Exercises_8_16.ipynb │ └── Solutions_Exercises_8_16_no_output.ipynb ├── notebooks │ ├── 00_Python_Programming_for_Absolute_Beginners.ipynb │ ├── 01_The_Pizza_Problem.ipynb │ └── 02_Working_with_Files_Texts_and_Regular_Expressions.ipynb ├── scripts │ ├── count.py │ ├── helloworld.py │ ├── my_functions.py │ └── use_functions.py ├── slides │ ├── 00 - Python Programming for Absolute Beginners.pdf │ ├── 00 - Python Programming for Absolute Beginners.pptx │ ├── 01 - PizzaProblem.png │ ├── 01 - The Pizza Problem.pdf │ ├── 01 - The Pizza Problem.pptx │ ├── 02 - Working with Files, Texts, and Regular Expressions.pdf │ ├── 02 - Working with Files, Texts, and Regular Expressions.pptx │ ├── 03 - Python for (Corpus) Linguists.pdf │ ├── 03 - Python for (Corpus) Linguists.pptx │ ├── 04 - Summary and Resources.pdf │ ├── 04 - Summary and Resources.pptx │ ├── 05 - Setting Up Your Development Environment.pdf │ └── 05 - Setting Up Your Development Environment.pptx └── youtube-video-descriptions.md ├── 2021 ├── data │ ├── corpora │ │ └── .gitkeep │ ├── data.md │ └── download_sherlockholmes.sh ├── exercises │ ├── Additional_Exercises_Frequency_Distribution.ipynb │ ├── Additional_Exercises_RegEx.ipynb │ ├── Additional_Exercises_Solutions_Frequency_Distribution.ipynb │ ├── 
Additional_Exercises_Solutions_RegEx.ipynb │ ├── Exercises 8-17.docx │ ├── Exercises 8-17.pdf │ └── Exercises_8_to_17.ipynb ├── learning-path-beginner-long.md ├── learning-path-beginner-short.md ├── learning-path-experienced.md ├── notebooks │ ├── .gitkeep │ └── 03_New_Tools_and_Syntax.ipynb ├── scripts │ └── my_functions.R ├── slides │ ├── 03 - Python for (Corpus) Linguists.pdf │ ├── 03 - Python for (Corpus) Linguists.pptx │ ├── 06 - Google Colab.pdf │ ├── 06 - Google Colab.pptx │ └── Additional Exercises.pptx └── youtube-video-descriptions.md ├── 2022 ├── exercises │ ├── Exercises_8_to_17.ipynb │ └── Solutions_Exercises_8_17.ipynb ├── slides │ ├── 03 - Python for (Corpus) Linguists.pdf │ └── 03 - Python for (Corpus) Linguists.pptx └── youtube-video-descriptions.md ├── .gitignore ├── Bonus Notebooks ├── Pizza_Problem_Dataclass_Solution.ipynb ├── Understanding_Classes_and_Objects.ipynb └── Working_with_R_in_Python.ipynb ├── Coding_Style_Guide.md ├── Command_Line_Primer.md ├── Commenting_in_Python.md ├── LICENSE.md ├── Links_to_Resources.md ├── Markdown_Primer.md ├── README.md └── images ├── banner.png ├── lp-experienced.svg ├── lp-long.svg └── lp-short.svg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Misc 132 | TODO.txt 133 | 134 | # Microsoft Office Temp 135 | ~$*.pptx 136 | ~$*.docx -------------------------------------------------------------------------------- /2020/README_2020.md: -------------------------------------------------------------------------------- 1 | # Python Programming for Linguists (2020) 2 | 3 | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IngoKl/python-programming-for-linguists/) 4 | 5 | ![Python Programming for Linguists](https://github.com/IngoKl/python-programming-for-linguists/blob/main/images/banner.png) 6 | 7 | --- 8 | 9 | **Live Session**: ~~The [live session](https://www.twitch.tv/ingokl) will be on **January 15th, 2021** at **18:00 CET**. ([Twitch](https://www.twitch.tv/ingokl))~~ 10 | 11 | [Edited Recording of the 2020/2021 Live Session](https://www.youtube.com/watch?v=70g9oeclNac) 12 | 13 | --- 14 | 15 | Welcome to this repository for the "Python Programming for Linguists" workshop. 16 | 17 | In this workshop, consisting of five videos, several exercises, and a live session, you will be introduced to Python. After a short general introduction to programming as well as Python, we will utilize Python to solve several (corpus) linguistic exercises. 18 | 19 | This workshop is specifically targeted towards people who have **no prior experience programming**. 
While this workshop is not intended to make you a programmer, you will gain a fundamental understanding of how programming works and how to proceed should you want to deepen your knowledge and skills. Also, by looking at various example tasks that are commonly solved using existing software, we will try to deepen our understanding of how commonly used tools work under the hood. 20 | 21 | Please **be aware** that this workshop was specifically designed as **a first introduction to programming for non-coders and linguists** and not as a fully-fledged Python course. Therefore, we will take some shortcuts, disobey some best practices, and hide away quite a few of the underlying complexities. If you are interested in a more thorough introduction or want to deepen your already existing knowledge, please refer to the final video in which I present many great resources. 22 | 23 | This workshop is inspired by workshops I held at [35c3](https://events.ccc.de/congress/2018/wiki/index.php/Session:(Python)_Programming_for_Absolute_Beginners) and [36c3 a](https://events.ccc.de/congress/2019/wiki/index.php/Session:Python_Programming_for_Absolute_Beginners)/[36c3 b](https://events.ccc.de/congress/2019/wiki/index.php/Session:Introduction_to_Natural_Language_Processing). 24 | 25 | ## Learning Objectives 26 | 27 | After completing this workshop, you will be able to ... 28 | 29 | * describe what programming essentially is about. 30 | * name and describe some basic programming terminology. 31 | * model simple problems in terms of data structures and basic algorithms. 32 | * write basic scripts in Python in order to solve specific problems. 33 | * utilize third-party libraries such as [NLTK](https://www.nltk.org), [spaCy](https://spacy.io), and [TextDirectory](https://github.com/IngoKl/textdirectory). 34 | * construct and apply basic regular expressions. 35 | * utilize Python for text manipulation. 36 | * utilize Python to perform concordance and frequency analysis. 
37 | * automatically annotate texts (PoS, Universal Dependencies, NER) using spaCy. 38 | * scrape web data in order to build corpora (Web as Corpus) using Python. 39 | * compute basic statistics using Python. 40 | 41 | ## Workshop Outline 42 | 43 | Of course, you can use the materials here as you see fit. However, ideally, you take this workshop in the following order: 44 | 45 | 1. Watch the video ["Python Programming for Absolute Beginners"](https://www.youtube.com/watch?v=4UnF45lniyY). 46 | 2. Familiarize yourself with Python notebooks and try to solve [exercises 1 to 3](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%201-3.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_1_3.ipynb)). 47 | 3. Watch the video ["The Pizza Problem"](https://www.youtube.com/watch?v=g9tOyVI5B3E). 48 | 4. Try to solve [exercises 4 and 5](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%204-5.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_4_5.ipynb)). 49 | 5. Watch the video ["Working with Files, Texts, and Regular Expressions"](https://www.youtube.com/watch?v=y37_JvSY-GM). 50 | 6. Try to solve [exercises 6 and 7](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%206-7.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_6_7.ipynb)). 51 | 7. Think about [exercises 8 to 16](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%208-16.pdf) and do some research on how you could approach these problems. 52 | 8. Attend the live session or watch the [recording](https://www.youtube.com/watch?v=70g9oeclNac) - Python for (Corpus) Linguists + Exercises 8 to 16. 53 | 9. 
Watch the video ["Summary and Resources"](https://www.youtube.com/watch?v=ajKqESDmrKc). 54 | 55 | Please note that for each exercise, you will find **solutions in this repository**. Don't feel bad if you cannot immediately solve the exercises - the solutions are there to help you. Of course, feel free to take apart these suggested solutions and play with them. 56 | 57 | ## Coding Along 58 | 59 | I want to strongly encourage you to code along and to experiment with the exercises. The easiest way of doing this is to use *Google Colab* (see above). In order to do this, you will need a Google Account. 60 | 61 | If you do not want to rely on Google, you can also set up your own local development environment. For a tutorial on how to do this on Windows, have a look at the video ["Setting Up Your Development Environment (Windows)"](https://www.youtube.com/watch?v=xrXEouns3fg). 62 | 63 | The videos are intended to be paused from time to time. Do not feel forced to watch through a whole video before playing with the code :). 64 | 65 | ## Live Session 66 | 67 | The live session will take place on **January 15th, 2021** at **18:00 CET** on [Twitch](https://www.twitch.tv/ingokl). 68 | 69 | The **recording** of the live session has been edited and slightly polished. A few unnecessary parts have been cut and slides, instead of impromptu drawings, have been added. Even if you have attended the live session, I would recommend having a look at the recording to revisit the exercises. 70 | 71 | [Edited Recording of the 2020/2021 Live Session](https://www.youtube.com/watch?v=70g9oeclNac) 72 | 73 | ## Videos and Recordings (2020) 74 | 75 | All of these videos are currently hosted on YouTube ([YouTube Playlist](https://www.youtube.com/playlist?list=PLG6oHk0SZfBxRIegm0QvzDvmumma7grp5)). 
76 | 77 | * [00 - Python Programming for Absolute Beginners](https://www.youtube.com/watch?v=4UnF45lniyY) 78 | * [01 - The Pizza Problem](https://www.youtube.com/watch?v=g9tOyVI5B3E) 79 | * [02 - Working with Files, Texts, and Regular Expressions](https://www.youtube.com/watch?v=y37_JvSY-GM) 80 | * [03 - Python for (Corpus) Linguists / Exercises 8-16](https://www.youtube.com/watch?v=70g9oeclNac) (This is an *edited and slightly polished* recording of the 2020/2021 live session) 81 | * [04 - Summary and Resources](https://www.youtube.com/watch?v=ajKqESDmrKc) 82 | * [05 - Setting Up Your Development Environment (Windows)](https://www.youtube.com/watch?v=xrXEouns3fg) (Alternative to using *Google Colab*. Please watch [video 00](https://www.youtube.com/watch?v=4UnF45lniyY) first in any case.) 83 | 84 | ## Exercises (2020) 85 | 86 | This workshop, next to the videos and livestream, has 16 exercises. You can find them, as well as the solutions, [here](https://github.com/IngoKl/python-programming-for-linguists/tree/main/2020/exercises). 
87 | 88 | * [Exercises 1-3](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%201-3.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_1_3.ipynb)) 89 | * [Exercises 4-5](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%204-5.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_4_5.ipynb)) 90 | * [Exercises 6-7](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%206-7.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_6_7.ipynb)) 91 | * [Exercises 8-16](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%208-16.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_8_16.ipynb)) (This notebook is a polished version of the one created during the live session) 92 | * [Bonus: Understanding Classes and Objects](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Bonus%20Notebooks/Understanding_Classes_and_Objects.ipynb) 93 | 94 | ## Slides 95 | 96 | All of the slides (in both `.pptx` and `.pdf`) [are available as well](https://github.com/IngoKl/python-programming-for-linguists/tree/main/2020/slides). 97 | 98 | ## Bonus Notebooks 99 | 100 | Aside from the main material, there are also a few advanced [bonus notebooks](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Bonus%20Notebooks) in this repository. Have a look at them to see more advanced and/or alternative solutions to some of the problems discussed in the workshop. 
101 | 102 | * [Pizza Problem: Dataclass Solution](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Bonus%20Notebooks/Pizza_Problem_Dataclass_Solution.ipynb) 103 | 104 | ## Helpful Additional Material 105 | 106 | * [Command Line Primer](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Command_Line_Primer.md) 107 | * [Commenting in Python](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Commenting_in_Python.md) 108 | * [Links to Resources](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Links_to_Resources.md) 109 | * [Video: A RegEx Primer for Linguistics](https://www.youtube.com/watch?v=p7-QkwOU9RY) 110 | * [Video: A Git Primer for Linguistics](https://www.youtube.com/watch?v=7EETKVp20y4) 111 | 112 | ## License 113 | 114 | You are (relatively) free to use all of these materials as you like. 115 | 116 | * The code (notebooks and scripts) is licensed under the [MIT License](https://github.com/IngoKl/python-programming-for-linguists/blob/main/LICENSE.md). 117 | * The slides, videos, and exercises are licensed under a [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0) license. 118 | -------------------------------------------------------------------------------- /2020/data/data.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | ## wikipedia 4 | 5 | These are modified/cleaned excerpts from Wikipedia (English) with each paragraph on one line. 6 | 7 | * python.txt - excerpt from 8 | * cologne.txt - excerpt from 9 | * linguistics.txt - excerpt from 10 | 11 | Wikipedia texts are available under the [Creative Commons Attribution-ShareAlike License](https://en.wikipedia.org/wiki/Wikipedia:Text_of_Creative_Commons_Attribution-ShareAlike_3.0_Unported_License). 12 | 13 | ## tokenize 14 | 15 | This folder contains two files/texts that can be tokenized. 
16 | 17 | * simple.txt contains one sentence that can be easily tokenized using a whitespace tokenizer. 18 | * challenge.txt contains two sentences that pose some challenges for a tokenizer. 19 | 20 | ## numerical 21 | 22 | This folder contains numerical data to experiment with libraries such as `Pandas`. 23 | 24 | * pandas_demo.csv is a comma-separated values file (separator: `,`) that contains four rows of numerical demo data. 25 | 26 | ## Downloadable Data 27 | 28 | ### Corpus of Contemporary American English (COCA) 29 | 30 | The [COCA](https://www.english-corpora.org/coca/) is a large, genre-balanced corpus of American English. It is being compiled and developed by Mark Davies and is available via [english-corpora.org](https://www.english-corpora.org/). 31 | 32 | Using the script below, you can download the freely available linear text sampler of the COCA. 33 | 34 | ```bash 35 | !git clone https://github.com/IngoKl/python-programming-for-linguists 36 | !cd python-programming-for-linguists/2020/data && sh download_coca.sh 37 | ``` 38 | 39 | The data, as individual text files, will then be available in `data/corpora/coca`. 40 | 41 | ### HUM19UK Corpus 42 | 43 | The [HUM19UK Corpus](https://www.linguisticsathuddersfield.com/hum19uk-corpus) is a corpus of 100 19th century British novels. It contains 100 complete novels written by 100 authors. It has been compiled by Fransina Stradling, Brian Walker, Dan McIntyre, Elliott Land, Hazel Price, and Michael Burke. 44 | 45 | Using the script below, you can download the corpus from [linguisticsathuddersfield.com/hum19uk-corpus](https://www.linguisticsathuddersfield.com/hum19uk-corpus). 46 | 47 | ```bash 48 | !git clone https://github.com/IngoKl/python-programming-for-linguists 49 | !cd python-programming-for-linguists/2020/data && sh download_hum19uk.sh 50 | ``` 51 | 52 | The data, as individual text files, will then be available in `data/corpora/hum19uk`. 
53 | -------------------------------------------------------------------------------- /2020/data/download_coca.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p ./corpora/coca 4 | mkdir temp 5 | 6 | cd temp 7 | wget https://www.corpusdata.org/coca/samples/coca-samples-text.zip 8 | unzip coca-samples-text.zip -d ./../corpora/coca 9 | 10 | cd .. 11 | rm -r temp -------------------------------------------------------------------------------- /2020/data/download_hum19uk.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p ./corpora/hum19uk 4 | mkdir temp 5 | 6 | cd temp 7 | #wget https://www.linguisticsathuddersfield.com/s/1800-1849.zip 8 | wget https://teaching.ingokleiber.de/hum19uk/1800-1849.zip 9 | #wget https://www.linguisticsathuddersfield.com/s/1850-1899.zip 10 | wget https://teaching.ingokleiber.de/hum19uk/1850-1899.zip 11 | unzip '*.zip' -d ./hum19uk 12 | 13 | cd hum19uk 14 | unzip '*.zip' -d ./../../corpora/hum19uk 15 | 16 | cd ../.. 17 | rm -r temp -------------------------------------------------------------------------------- /2020/data/numerical/pandas_demo.csv: -------------------------------------------------------------------------------- 1 | Document,Tokens,Sentiment 2 | 0,1000,0.2 3 | 1,2000,0.3 4 | 2,3000,0.8 5 | 3,3000 -------------------------------------------------------------------------------- /2020/data/tokenize/challenge.txt: -------------------------------------------------------------------------------- 1 | Sue owed Ms. O'Neil $10. Unfortunately, she didn't have the money. -------------------------------------------------------------------------------- /2020/data/tokenize/simple.txt: -------------------------------------------------------------------------------- 1 | The black cat chased the mouse. 
-------------------------------------------------------------------------------- /2020/data/wikipedia/cologne.txt: -------------------------------------------------------------------------------- 1 | Cologne is the largest city of Germany's most populous federal state of North Rhine-Westphalia and the fourth-most populous city in Germany. With slightly over a million inhabitants (1.09 million) within its city boundaries, Cologne is the largest city on the Rhine and also the most populous city both of the Rhine-Ruhr Metropolitan Region, which is Germany's largest and one of Europe's major metropolitan areas, and of the Rhineland. Centered on the left bank of the Rhine, Cologne is about 45 kilometres (28 mi) southeast of North Rhine-Westphalia's capital of Düsseldorf and 25 kilometres (16 mi) northwest of Bonn. It is the largest city in the Central Franconian and Ripuarian dialect areas. 2 | The city's Cologne Cathedral (Kölner Dom) is the seat of the Catholic Archbishop of Cologne. There are many institutions of higher education in the city, most notably the University of Cologne (Universität zu Köln), one of Europe's oldest and largest universities, the Technical University of Cologne (Technische Hochschule Köln), Germany's largest university of applied sciences, and the German Sport University Cologne (Deutsche Sporthochschule Köln), Germany's only sport university. Cologne Bonn Airport (Flughafen Köln/Bonn) is Germany's seventh-largest airport and lies in the southeast of the city. The main airport for the Rhine-Ruhr region is Düsseldorf Airport. 3 | Cologne was founded and established in Ubii territory in the 1st century AD as the Roman Colonia Claudia Ara Agrippinensium, the first word of which is the origin of its name. An alternative Latin name of the settlement is Augusta Ubiorum, after the Ubii. "Cologne", the French version of the city's name, has become standard in English as well. 
Cologne functioned as the capital of the Roman province of Germania Inferior and as the headquarters of the Roman military in the region until occupied by the Franks in 462. During the Middle Ages the city flourished as being located on one of the most important major trade routes between east and western Europe. Cologne was one of the leading members of the Hanseatic League and one of the largest cities north of the Alps in medieval and Renaissance times. Prior to World War II, the city had undergone several occupations by the French and also by the British (1918–1926). Cologne was one of the most heavily bombed cities in Germany during World War II, with the Royal Air Force (RAF) dropping 34,711 long tons (35,268 tonnes) of bombs on the city. The bombing reduced the population by 95%, mainly due to evacuation, and destroyed almost the entire city centre. With the intention of restoring as many historic landmarks as possible, the postwar rebuilding has resulted in a very mixed and unique cityscape. 4 | Cologne is a major cultural centre for the Rhineland; it hosts more than 30 museums and hundreds of galleries. Exhibitions range from local ancient Roman archeological sites to contemporary graphics and sculpture. The Cologne Trade Fair hosts a number of trade shows such as Art Cologne, imm Cologne, Gamescom, and the Photokina. -------------------------------------------------------------------------------- /2020/data/wikipedia/linguistics.txt: -------------------------------------------------------------------------------- 1 | Linguistics is the scientific study of language. It involves an analysis of language form, language meaning, and language in context, as well as an analysis of the social, cultural, historical, and political factors that influence language. 2 | Linguists traditionally analyse human language by observing the relationship between sound and meaning. 
Meaning can be studied in its directly spoken or written form through the field of semantics, as well as in its indirect form through body language and gestures under the discipline of pragmatics. Each speech sound particle is called a phoneme. How these phonemes are organised to convey meaning depends on various linguistic patterns and structures that theoretical linguists describe and analyse. 3 | Some of these patterns of sound and meaning are found in the study of morphology (concerning how words are formulated through "morphemes"), syntax (how sentences are logically structured), and phonology (the study of sound patterns). The emergence of historical and evolutionary linguistics has also led to a greater focus over studying how languages change and grow, particularly over an extended period of time. Sociolinguists also study how language develops among different communities through dialects, and how each language changes, grows, and varies from person to person and group to group. 4 | Macrolinguistic concepts include the study of narrative theory, stylistics, discourse analysis, and semiotics. Microlinguistic concepts, on the other hand, involve the analysis of grammar, speech sounds, palaeographic symbols, connotation, and logical references, all of which can be applied to lexicography, editing, language documentation, translation, as well as speech-language pathology (a corrective method to cure phonetic disabilities and disfunctions). 5 | The earliest activities in the documentation and description of language have been attributed to the 6th-century-BC Indian grammarian Pāṇini who wrote a formal description of the Sanskrit language in his Aṣṭādhyāyī. Today, modern-day theories on grammar employ many of the principles that were laid down back then. 
-------------------------------------------------------------------------------- /2020/data/wikipedia/python.txt: -------------------------------------------------------------------------------- 1 | Python is an interpreted, high-level and general-purpose programming language. Python's design philosophy emphasizes code readability with its notable use of significant whitespace. Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small and large-scale projects. 2 | Python is dynamically typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly, procedural), object-oriented, and functional programming. Python is often described as a "batteries included" language due to its comprehensive standard library. 3 | Python was created in the late 1980s, and first released in 1991, by Guido van Rossum as a successor to the ABC programming language. Python 2.0, released in 2000, introduced new features, such as list comprehensions, and a garbage collection system with reference counting, and was discontinued with version 2.7 in 2020. Python 3.0, released in 2008, was a major revision of the language that is not completely backward-compatible and much Python 2 code does not run unmodified on Python 3. With Python 2's end-of-life, only Python 3.6.x and later are supported, with older versions still supporting e.g. Windows 7 (and old installers not restricted to 64-bit Windows). 4 | Python interpreters are supported for mainstream operating systems and available for a few more (and in the past supported many more). A global community of programmers develops and maintains CPython, a free and open-source reference implementation. A non-profit organization, the Python Software Foundation, manages and directs resources for Python and CPython development. 5 | As of December 2020 Python ranked third in TIOBE’s index of most popular programming languages, behind C and Java. 
-------------------------------------------------------------------------------- /2020/data/xml/bnc_style.xml: -------------------------------------------------------------------------------- 1 | 2 | I 3 | have 4 | bought 5 | a 6 | car 7 | . 8 | -------------------------------------------------------------------------------- /2020/data/xml/xpath_example.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | The 5 | flower 6 | was 7 | red. 8 | 9 | 10 | It 11 | smelled 12 | of 13 | summer. 14 | 15 | 16 | 17 | 18 | She 19 | enjoyed 20 | the 21 | trip. 22 | 23 | 24 | They 25 | took 26 | a 27 | bus. 28 | 29 | 30 | -------------------------------------------------------------------------------- /2020/exercises/Exercises 1-3.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 1-3.docx -------------------------------------------------------------------------------- /2020/exercises/Exercises 1-3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 1-3.pdf -------------------------------------------------------------------------------- /2020/exercises/Exercises 4-5.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 4-5.docx -------------------------------------------------------------------------------- /2020/exercises/Exercises 4-5.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 4-5.pdf -------------------------------------------------------------------------------- /2020/exercises/Exercises 6-7.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 6-7.docx -------------------------------------------------------------------------------- /2020/exercises/Exercises 6-7.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 6-7.pdf -------------------------------------------------------------------------------- /2020/exercises/Exercises 8-16.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 8-16.docx -------------------------------------------------------------------------------- /2020/exercises/Exercises 8-16.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/exercises/Exercises 8-16.pdf -------------------------------------------------------------------------------- /2020/exercises/Playground.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Playground.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 
| "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "UV91AtD1i95-" 20 | }, 21 | "source": [ 22 | "# Python Programming for Linguists\r\n", 23 | "## Playground" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "metadata": { 29 | "colab": { 30 | "base_uri": "https://localhost:8080/" 31 | }, 32 | "id": "ytZFXMEJjD05", 33 | "outputId": "5cb6e274-115d-4b8a-9fa3-c2edce1bb8c1" 34 | }, 35 | "source": [ 36 | "print('Have fun!')" 37 | ], 38 | "execution_count": null, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "text": [ 43 | "Have fun!\n" 44 | ], 45 | "name": "stdout" 46 | } 47 | ] 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /2020/exercises/Solutions_Exercises_1_3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Solutions - Exercises 1 - 3.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "4lT71anpPzb4" 20 | }, 21 | "source": [ 22 | "# Exercises 1 - 3" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "id": "PYbZO_PFP2sl" 29 | }, 30 | "source": [ 31 | "## Exercise 1 - Printing Your Name" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "metadata": { 37 | "colab": { 38 | "base_uri": "https://localhost:8080/" 39 | }, 40 | "id": "HO-YR2fURh9s", 41 | "outputId": "cc31fdbd-313a-4ecd-dcb5-aaa8ce8fa375" 42 | }, 43 | "source": [ 44 | "print('Sue')" 45 | ], 46 | "execution_count": 2, 47 | "outputs": [ 48 | { 49 | "output_type": "stream", 50 | "text": [ 51 | "Sue\n" 52 | ], 53 | "name": "stdout" 54 | } 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "metadata": { 60 | "colab": { 61 | "base_uri": "https://localhost:8080/" 62 | 
}, 63 | "id": "4rYwzY6PRlTv", 64 | "outputId": "5ee2fedb-18b8-4edb-f420-0154aae6729f" 65 | }, 66 | "source": [ 67 | "print('Sue'.upper())" 68 | ], 69 | "execution_count": 4, 70 | "outputs": [ 71 | { 72 | "output_type": "stream", 73 | "text": [ 74 | "SUE\n" 75 | ], 76 | "name": "stdout" 77 | } 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "id": "fVUYJrB4P4Kl" 84 | }, 85 | "source": [ 86 | "## Exercise 2 - Adding Variables" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "metadata": { 92 | "id": "dVhci8NWR5bC" 93 | }, 94 | "source": [ 95 | "name = 'Sue'\r\n", 96 | "age = 20" 97 | ], 98 | "execution_count": 6, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "metadata": { 104 | "colab": { 105 | "base_uri": "https://localhost:8080/" 106 | }, 107 | "id": "lU-jv2tVR9R6", 108 | "outputId": "a26a868a-d585-4400-9ca1-c90493dc9441" 109 | }, 110 | "source": [ 111 | "print(f'{name} is {age}')" 112 | ], 113 | "execution_count": 7, 114 | "outputs": [ 115 | { 116 | "output_type": "stream", 117 | "text": [ 118 | "Sue is 20\n" 119 | ], 120 | "name": "stdout" 121 | } 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "id": "_9GipoF0SDpB" 128 | }, 129 | "source": [ 130 | "While `name` is a string, `age` is an integer. You cannot just use integers as if they were text. However, modern Python allows you to use so-called f-strings (as seen above) which easily allow you to print variables. Here, Python figures out how to represent the age as text automatically." 
131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "metadata": { 136 | "colab": { 137 | "base_uri": "https://localhost:8080/" 138 | }, 139 | "id": "Uwwr41O3SYBF", 140 | "outputId": "742286ea-d16f-4605-eeb2-9364fa6fab9f" 141 | }, 142 | "source": [ 143 | "type(age)" 144 | ], 145 | "execution_count": 9, 146 | "outputs": [ 147 | { 148 | "output_type": "execute_result", 149 | "data": { 150 | "text/plain": [ 151 | "int" 152 | ] 153 | }, 154 | "metadata": { 155 | "tags": [] 156 | }, 157 | "execution_count": 9 158 | } 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "colab": { 165 | "base_uri": "https://localhost:8080/" 166 | }, 167 | "id": "NqyY0y57Sp6t", 168 | "outputId": "b63c8ff7-e343-4593-8bab-916411315572" 169 | }, 170 | "source": [ 171 | "age_string = str(age)\r\n", 172 | "type(age_string)" 173 | ], 174 | "execution_count": 13, 175 | "outputs": [ 176 | { 177 | "output_type": "execute_result", 178 | "data": { 179 | "text/plain": [ 180 | "str" 181 | ] 182 | }, 183 | "metadata": { 184 | "tags": [] 185 | }, 186 | "execution_count": 13 187 | } 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "id": "90-2TPKgStIZ" 194 | }, 195 | "source": [ 196 | "Above you can see that we can use `type` to check what type a variable has. In the following two lines, we are using `str` to transform `age` into a string. When using f-strings, this happens automatically."
197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": { 202 | "id": "9OTC96SVP4S-" 203 | }, 204 | "source": [ 205 | "## Exercise 3 - Looping Numbers" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "metadata": { 211 | "colab": { 212 | "base_uri": "https://localhost:8080/" 213 | }, 214 | "id": "Iy9P9mxgTLa6", 215 | "outputId": "61482dc2-3747-416e-ab50-cad1a6076f6e" 216 | }, 217 | "source": [ 218 | "for number in range(50):\r\n", 219 | " if number < 25:\r\n", 220 | " print(f'{number} is smaller than 25')\r\n", 221 | " if number > 25:\r\n", 222 | " print(f'{number} is larger than 25')" 223 | ], 224 | "execution_count": 22, 225 | "outputs": [ 226 | { 227 | "output_type": "stream", 228 | "text": [ 229 | "0 is smaller than 25\n", 230 | "1 is smaller than 25\n", 231 | "2 is smaller than 25\n", 232 | "3 is smaller than 25\n", 233 | "4 is smaller than 25\n", 234 | "5 is smaller than 25\n", 235 | "6 is smaller than 25\n", 236 | "7 is smaller than 25\n", 237 | "8 is smaller than 25\n", 238 | "9 is smaller than 25\n", 239 | "10 is smaller than 25\n", 240 | "11 is smaller than 25\n", 241 | "12 is smaller than 25\n", 242 | "13 is smaller than 25\n", 243 | "14 is smaller than 25\n", 244 | "15 is smaller than 25\n", 245 | "16 is smaller than 25\n", 246 | "17 is smaller than 25\n", 247 | "18 is smaller than 25\n", 248 | "19 is smaller than 25\n", 249 | "20 is smaller than 25\n", 250 | "21 is smaller than 25\n", 251 | "22 is smaller than 25\n", 252 | "23 is smaller than 25\n", 253 | "24 is smaller than 25\n", 254 | "26 is larger than 25\n", 255 | "27 is larger than 25\n", 256 | "28 is larger than 25\n", 257 | "29 is larger than 25\n", 258 | "30 is larger than 25\n", 259 | "31 is larger than 25\n", 260 | "32 is larger than 25\n", 261 | "33 is larger than 25\n", 262 | "34 is larger than 25\n", 263 | "35 is larger than 25\n", 264 | "36 is larger than 25\n", 265 | "37 is larger than 25\n", 266 | "38 is larger than 25\n", 267 | "39 is larger than 25\n", 268 
| "40 is larger than 25\n", 269 | "41 is larger than 25\n", 270 | "42 is larger than 25\n", 271 | "43 is larger than 25\n", 272 | "44 is larger than 25\n", 273 | "45 is larger than 25\n", 274 | "46 is larger than 25\n", 275 | "47 is larger than 25\n", 276 | "48 is larger than 25\n", 277 | "49 is larger than 25\n" 278 | ], 279 | "name": "stdout" 280 | } 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "id": "3Nju6_TXTpNY" 287 | }, 288 | "source": [ 289 | "If you look carefully, you will see that the number 25 is missing from our output. This happens because 25 meets neither condition." 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "metadata": { 295 | "colab": { 296 | "base_uri": "https://localhost:8080/" 297 | }, 298 | "id": "dhTh3rpOTwLE", 299 | "outputId": "8b18c0bd-504b-42ba-9b4a-bb5dd37ffe4d" 300 | }, 301 | "source": [ 302 | "for number in range(50):\r\n", 303 | " if number < 25:\r\n", 304 | " print(f'{number} is smaller than 25')\r\n", 305 | " if number > 25:\r\n", 306 | " print(f'{number} is larger than 25')\r\n", 307 | " if number == 25:\r\n", 308 | " print(f'{number} is equal to 25')" 309 | ], 310 | "execution_count": 24, 311 | "outputs": [ 312 | { 313 | "output_type": "stream", 314 | "text": [ 315 | "0 is smaller than 25\n", 316 | "1 is smaller than 25\n", 317 | "2 is smaller than 25\n", 318 | "3 is smaller than 25\n", 319 | "4 is smaller than 25\n", 320 | "5 is smaller than 25\n", 321 | "6 is smaller than 25\n", 322 | "7 is smaller than 25\n", 323 | "8 is smaller than 25\n", 324 | "9 is smaller than 25\n", 325 | "10 is smaller than 25\n", 326 | "11 is smaller than 25\n", 327 | "12 is smaller than 25\n", 328 | "13 is smaller than 25\n", 329 | "14 is smaller than 25\n", 330 | "15 is smaller than 25\n", 331 | "16 is smaller than 25\n", 332 | "17 is smaller than 25\n", 333 | "18 is smaller than 25\n", 334 | "19 is smaller than 25\n", 335 | "20 is smaller than 25\n", 336 | "21 is smaller than 25\n", 337 | "22 is smaller
than 25\n", 338 | "23 is smaller than 25\n", 339 | "24 is smaller than 25\n", 340 | "25 is equal to 25\n", 341 | "26 is larger than 25\n", 342 | "27 is larger than 25\n", 343 | "28 is larger than 25\n", 344 | "29 is larger than 25\n", 345 | "30 is larger than 25\n", 346 | "31 is larger than 25\n", 347 | "32 is larger than 25\n", 348 | "33 is larger than 25\n", 349 | "34 is larger than 25\n", 350 | "35 is larger than 25\n", 351 | "36 is larger than 25\n", 352 | "37 is larger than 25\n", 353 | "38 is larger than 25\n", 354 | "39 is larger than 25\n", 355 | "40 is larger than 25\n", 356 | "41 is larger than 25\n", 357 | "42 is larger than 25\n", 358 | "43 is larger than 25\n", 359 | "44 is larger than 25\n", 360 | "45 is larger than 25\n", 361 | "46 is larger than 25\n", 362 | "47 is larger than 25\n", 363 | "48 is larger than 25\n", 364 | "49 is larger than 25\n" 365 | ], 366 | "name": "stdout" 367 | } 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": { 373 | "id": "izSIq763T3Ef" 374 | }, 375 | "source": [ 376 | "Please note that this, technically, now produces a 'wrong' result if we take the task, which did not specify this condition, seriously."
377 | ] 378 | } 379 | ] 380 | } -------------------------------------------------------------------------------- /2020/exercises/Solutions_Exercises_6_7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Solutions - Exercises 6 - 7.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "OES5Wz_Q9pMS" 20 | }, 21 | "source": [ 22 | "# Exercises 6 - 7" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "id": "UmfbDeLu-flW" 29 | }, 30 | "source": [ 31 | "## Exercise 6 - Slicing and Modifying\r\n", 32 | "\r\n", 33 | "There are (at least) two ways in which we can approach this exercise. We can either take a slice of characters (A) or tokenize the string (B).\r\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "metadata": { 39 | "id": "Be_-HytuBQym" 40 | }, 41 | "source": [ 42 | "text = 'Python programming can be fun.'" 43 | ], 44 | "execution_count": 2, 45 | "outputs": [] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "yHKiEGIzVbYs" 51 | }, 52 | "source": [ 53 | "### Variant A (Slicing)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "metadata": { 59 | "id": "GVHgi4L9-vHP" 60 | }, 61 | "source": [ 62 | "third_word_a = text[19:23]\r\n", 63 | "third_word_a_upper = third_word_a.upper()" 64 | ], 65 | "execution_count": 11, 66 | "outputs": [] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "metadata": { 71 | "colab": { 72 | "base_uri": "https://localhost:8080/" 73 | }, 74 | "id": "A08Ly-1R_HCk", 75 | "outputId": "44b10f53-4d3f-4e5d-ff69-3ae2e9541b4f" 76 | }, 77 | "source": [ 78 | "print(third_word_a_upper)" 79 | ], 80 | "execution_count": 20, 81 | "outputs": [ 82 | { 83 | "output_type": "stream", 84 | 
"text": [ 85 | "CAN\n" 86 | ], 87 | "name": "stdout" 88 | } 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "id": "kLWassZ__QMw" 95 | }, 96 | "source": [ 97 | "If we don't want to count the characters, we can also use a combination of `find` and `len`." 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "metadata": { 103 | "id": "i263ejbw_Pu8" 104 | }, 105 | "source": [ 106 | "starting_point = text.find('can')\r\n", 107 | "end_point = starting_point + len('can')\r\n", 108 | "\r\n", 109 | "third_word_a = text[starting_point:end_point]\r\n", 110 | "third_word_a_upper = third_word_a.upper()" 111 | ], 112 | "execution_count": 17, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "colab": { 119 | "base_uri": "https://localhost:8080/" 120 | }, 121 | "id": "YAaPEsX8UsAe", 122 | "outputId": "5a8f8be3-1e73-4525-ac80-87f487865689" 123 | }, 124 | "source": [ 125 | "print(third_word_a_upper)" 126 | ], 127 | "execution_count": 18, 128 | "outputs": [ 129 | { 130 | "output_type": "stream", 131 | "text": [ 132 | "CAN\n" 133 | ], 134 | "name": "stdout" 135 | } 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "id": "_VFR4swuU0Q7" 142 | }, 143 | "source": [ 144 | "Now that we have this solution, we can generalize even further and create a function that finds and modifies arbitrary substrings. Of course, this function does not make any sense, but it nicely shows how we can gradually approach generalized solutions." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "metadata": { 150 | "id": "c3dwhIa8UwtO" 151 | }, 152 | "source": [ 153 | "def find_and_uppercase(text, search):\r\n", 154 | " starting_point = text.find(search)\r\n", 155 | " end_point = starting_point + len(search)\r\n", 156 | "\r\n", 157 | " word = text[starting_point:end_point]\r\n", 158 | " word_upper = word.upper()\r\n", 159 | "\r\n", 160 | " return word_upper" 161 | ], 162 | "execution_count": 23, 163 | "outputs": [] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "metadata": { 168 | "colab": { 169 | "base_uri": "https://localhost:8080/", 170 | "height": 35 171 | }, 172 | "id": "msr-rLvzVVvV", 173 | "outputId": "cd678cd7-87cd-4eeb-e1a8-34fef9760996" 174 | }, 175 | "source": [ 176 | "find_and_uppercase(text, 'can')" 177 | ], 178 | "execution_count": 24, 179 | "outputs": [ 180 | { 181 | "output_type": "execute_result", 182 | "data": { 183 | "application/vnd.google.colaboratory.intrinsic+json": { 184 | "type": "string" 185 | }, 186 | "text/plain": [ 187 | "'CAN'" 188 | ] 189 | }, 190 | "metadata": { 191 | "tags": [] 192 | }, 193 | "execution_count": 24 194 | } 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": { 200 | "id": "b02QDFFSVfos" 201 | }, 202 | "source": [ 203 | "### Variant B (Tokenization)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "metadata": { 209 | "colab": { 210 | "base_uri": "https://localhost:8080/" 211 | }, 212 | "id": "419IRXkcVhjj", 213 | "outputId": "a233a998-5690-4695-d868-d09992dc73bc" 214 | }, 215 | "source": [ 216 | "tokenized = text.split() # Without any arguments, split will use whitespace\r\n", 217 | "tokenized" 218 | ], 219 | "execution_count": 30, 220 | "outputs": [ 221 | { 222 | "output_type": "execute_result", 223 | "data": { 224 | "text/plain": [ 225 | "['Python', 'programming', 'can', 'be', 'fun.']" 226 | ] 227 | }, 228 | "metadata": { 229 | "tags": [] 230 | }, 231 | "execution_count": 30 232 | } 233 | ] 234 | }, 235 | { 236 | 
"cell_type": "code", 237 | "metadata": { 238 | "id": "LH1cBbsZVkjN" 239 | }, 240 | "source": [ 241 | "third_word_b = tokenized[2]\r\n", 242 | "third_word_b_upper = third_word_b.upper()" 243 | ], 244 | "execution_count": 28, 245 | "outputs": [] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "metadata": { 250 | "colab": { 251 | "base_uri": "https://localhost:8080/", 252 | "height": 35 253 | }, 254 | "id": "i1CtXgXoVsUs", 255 | "outputId": "b1686cf3-84f2-4b84-cbc0-49798436d8f2" 256 | }, 257 | "source": [ 258 | "third_word_b_upper" 259 | ], 260 | "execution_count": 29, 261 | "outputs": [ 262 | { 263 | "output_type": "execute_result", 264 | "data": { 265 | "application/vnd.google.colaboratory.intrinsic+json": { 266 | "type": "string" 267 | }, 268 | "text/plain": [ 269 | "'CAN'" 270 | ] 271 | }, 272 | "metadata": { 273 | "tags": [] 274 | }, 275 | "execution_count": 29 276 | } 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "id": "GE5BUi2qV2hj" 283 | }, 284 | "source": [ 285 | "## Exercise 7 - Counting Tokens" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "id": "iSxN_weCWDp2" 292 | }, 293 | "source": [ 294 | "First we will download (`git clone`) the repository. This way, we will have access to the two files (`simple.txt` and `challenge.txt`)." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "metadata": { 300 | "id": "ZhNDMR0gV7cY" 301 | }, 302 | "source": [ 303 | "%%capture\r\n", 304 | "!git clone https://github.com/IngoKl/python-programming-for-linguists" 305 | ], 306 | "execution_count": 38, 307 | "outputs": [] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "id": "L9kQs41kWyVp" 313 | }, 314 | "source": [ 315 | "First, we will open and read the file." 
316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "metadata": { 321 | "id": "qXTKWOLOWNdu" 322 | }, 323 | "source": [ 324 | "with open('python-programming-for-linguists/2020/data/tokenize/simple.txt', 'r') as f:\r\n", 325 | " text = f.read()" 326 | ], 327 | "execution_count": 45, 328 | "outputs": [] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "metadata": { 333 | "colab": { 334 | "base_uri": "https://localhost:8080/", 335 | "height": 35 336 | }, 337 | "id": "IssXg7NKW1ww", 338 | "outputId": "25df52cc-3df0-445a-98eb-4accf374d80b" 339 | }, 340 | "source": [ 341 | "text" 342 | ], 343 | "execution_count": 46, 344 | "outputs": [ 345 | { 346 | "output_type": "execute_result", 347 | "data": { 348 | "application/vnd.google.colaboratory.intrinsic+json": { 349 | "type": "string" 350 | }, 351 | "text/plain": [ 352 | "'The black cat chased the mouse.'" 353 | ] 354 | }, 355 | "metadata": { 356 | "tags": [] 357 | }, 358 | "execution_count": 46 359 | } 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "id": "VHaPMEzoW7ga" 366 | }, 367 | "source": [ 368 | "We can build a very simple tokenizer, just as above, by using the `str.split()` method." 
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "metadata": { 374 | "colab": { 375 | "base_uri": "https://localhost:8080/" 376 | }, 377 | "id": "I-lFDl1EW3lP", 378 | "outputId": "358494d4-a3a6-4d04-e707-6d7b4d64996c" 379 | }, 380 | "source": [ 381 | "tokenized = text.split()\r\n", 382 | "\r\n", 383 | "tokenized" 384 | ], 385 | "execution_count": 50, 386 | "outputs": [ 387 | { 388 | "output_type": "execute_result", 389 | "data": { 390 | "text/plain": [ 391 | "['The', 'black', 'cat', 'chased', 'the', 'mouse.']" 392 | ] 393 | }, 394 | "metadata": { 395 | "tags": [] 396 | }, 397 | "execution_count": 50 398 | } 399 | ] 400 | }, 401 | { 402 | "cell_type": "markdown", 403 | "metadata": { 404 | "id": "MFHY8M19XDBA" 405 | }, 406 | "source": [ 407 | "Now we just need to get the length of the resulting list." 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "metadata": { 413 | "colab": { 414 | "base_uri": "https://localhost:8080/" 415 | }, 416 | "id": "D3Udp_EKXCvC", 417 | "outputId": "58497594-c7f9-46b1-9ab4-22ba1efaf636" 418 | }, 419 | "source": [ 420 | "len(tokenized)" 421 | ], 422 | "execution_count": 48, 423 | "outputs": [ 424 | { 425 | "output_type": "execute_result", 426 | "data": { 427 | "text/plain": [ 428 | "6" 429 | ] 430 | }, 431 | "metadata": { 432 | "tags": [] 433 | }, 434 | "execution_count": 48 435 | } 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": { 441 | "id": "q4yCKfyyXIRG" 442 | }, 443 | "source": [ 444 | "Now, as is requested in the exercise, we will put all of that into one function." 
445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "metadata": { 450 | "id": "-jhDk9eNXN8i" 451 | }, 452 | "source": [ 453 | "def count_tokens(file):\r\n", 454 | " with open(file, 'r') as f:\r\n", 455 | " text = f.read()\r\n", 456 | "\r\n", 457 | " tokenized = text.split()\r\n", 458 | "\r\n", 459 | " return len(tokenized)" 460 | ], 461 | "execution_count": 51, 462 | "outputs": [] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "metadata": { 467 | "colab": { 468 | "base_uri": "https://localhost:8080/" 469 | }, 470 | "id": "GQVePOM8XUyM", 471 | "outputId": "5f418242-6146-4afc-84ae-43c1a4c6b7d0" 472 | }, 473 | "source": [ 474 | "count_tokens('python-programming-for-linguists/2020/data/tokenize/simple.txt')" 475 | ], 476 | "execution_count": 52, 477 | "outputs": [ 478 | { 479 | "output_type": "execute_result", 480 | "data": { 481 | "text/plain": [ 482 | "6" 483 | ] 484 | }, 485 | "metadata": { 486 | "tags": [] 487 | }, 488 | "execution_count": 52 489 | } 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "id": "SUeN1RRdXcFf" 496 | }, 497 | "source": [ 498 | "Great, now let's try to use our function with the more challenging `challenge.txt` example." 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "metadata": { 504 | "colab": { 505 | "base_uri": "https://localhost:8080/" 506 | }, 507 | "id": "5xjsfKUHXjJG", 508 | "outputId": "81baba78-26cc-4828-d4e7-b5056623182c" 509 | }, 510 | "source": [ 511 | "count_tokens('python-programming-for-linguists/2020/data/tokenize/challenge.txt')" 512 | ], 513 | "execution_count": 53, 514 | "outputs": [ 515 | { 516 | "output_type": "execute_result", 517 | "data": { 518 | "text/plain": [ 519 | "11" 520 | ] 521 | }, 522 | "metadata": { 523 | "tags": [] 524 | }, 525 | "execution_count": 53 526 | } 527 | ] 528 | }, 529 | { 530 | "cell_type": "markdown", 531 | "metadata": { 532 | "id": "SLfBKsIWXnJP" 533 | }, 534 | "source": [ 535 | "This does not look good. 
Let's have a look at both the file and at the output of our tokenizer." 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "metadata": { 541 | "colab": { 542 | "base_uri": "https://localhost:8080/" 543 | }, 544 | "id": "GBj2D48aX2Xk", 545 | "outputId": "15a3d0ef-2300-42b8-8f04-63cf32f959e9" 546 | }, 547 | "source": [ 548 | "with open('python-programming-for-linguists/2020/data/tokenize/challenge.txt', 'r') as f:\r\n", 549 | " text = f.read()\r\n", 550 | "\r\n", 551 | "print(text)\r\n", 552 | "print(text.split())" 553 | ], 554 | "execution_count": 55, 555 | "outputs": [ 556 | { 557 | "output_type": "stream", 558 | "text": [ 559 | "Sue owed Ms. O'Neil $10. Unfortunately, she didn't have the money.\n", 560 | "['Sue', 'owed', 'Ms.', \"O'Neil\", '$10.', 'Unfortunately,', 'she', \"didn't\", 'have', 'the', 'money.']\n" 561 | ], 562 | "name": "stdout" 563 | } 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": { 569 | "id": "wYpoZjlGYBnb" 570 | }, 571 | "source": [ 572 | "Alright, there are various problems here.\r\n", 573 | "\r\n", 574 | "* *$* and *10* should arguably be split\r\n", 575 | "* *didn't* should also be split into two words or tokens\r\n", 576 | "* Due to the fact that we have *O'Neil*, we can't just split at the `'` character.\r\n", 577 | "* There's an extra space after *she* that potentially could cause trouble.\r\n", 578 | "\r\n", 579 | "Let's try to build a more robust tokenizer that can handle these cases. 
Our approach will be to modify the text before we do the tokenization.\r\n" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "metadata": { 585 | "id": "_YzliU6dYq-a" 586 | }, 587 | "source": [ 588 | "import re\r\n", 589 | "\r\n", 590 | "def count_tokens_optimized(text):\r\n", 591 | " # Replace double whitespace\r\n", 592 | " text = text.replace(' ', ' ')\r\n", 593 | "\r\n", 594 | " # Add a space between $/€ and numbers\r\n", 595 | " text = re.sub(r'(\\$|\\€)([0-9]*)\\b', r'\\1 \\2', text)\r\n", 596 | "\r\n", 597 | " # Add space between words and periods\r\n", 598 | " text = re.sub(r'(\\w+)(\\.)', r'\\1 \\2', text)\r\n", 599 | "\r\n", 600 | " # Account for the abbreviation\r\n", 601 | " text = text.replace(\"n't\", \" n't\")\r\n", 602 | "\r\n", 603 | " tokenized = text.split()\r\n", 604 | "\r\n", 605 | " return len(tokenized)" 606 | ], 607 | "execution_count": 71, 608 | "outputs": [] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "metadata": { 613 | "colab": { 614 | "base_uri": "https://localhost:8080/" 615 | }, 616 | "id": "X1j1eBMuZWlA", 617 | "outputId": "a7b5188e-1659-421d-e59d-f0dc68621d65" 618 | }, 619 | "source": [ 620 | "count_tokens_optimized(text)" 621 | ], 622 | "execution_count": 70, 623 | "outputs": [ 624 | { 625 | "output_type": "execute_result", 626 | "data": { 627 | "text/plain": [ 628 | "16" 629 | ] 630 | }, 631 | "metadata": { 632 | "tags": [] 633 | }, 634 | "execution_count": 70 635 | } 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": { 641 | "id": "gQ-qq1f0dXie" 642 | }, 643 | "source": [ 644 | "Great, our optimized function works very well. However, it will only work for this particular example and the edge cases (which are really not edge cases) we encountered here. Well, we at least have accounted for not just the *$* sign, but also for *€*." 
645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": { 650 | "id": "_82m0dw8dm7A" 651 | }, 652 | "source": [ 653 | "While state-of-the-art tokenizers use sophisticated language models to solve these problems, there are still good rule-based tokenizers out there. If you want to have a look at some real-world code, have a look at the [NLTKWordTokenizer](https://github.com/nltk/nltk/blob/develop/nltk/tokenize/destructive.py)." 654 | ] 655 | } 656 | ] 657 | } -------------------------------------------------------------------------------- /2020/notebooks/00_Python_Programming_for_Absolute_Beginners.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "00 - Python Programming for Absolute Beginners.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "fUSUA50eDcdR" 20 | }, 21 | "source": [ 22 | "# Python Programming for Linguists" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "metadata": { 28 | "id": "tewPOhqyDhXJ" 29 | }, 30 | "source": [ 31 | "print('Welcome to this workshop!')" 32 | ], 33 | "execution_count": null, 34 | "outputs": [] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "id": "VZ8hEKS5IaSl" 40 | }, 41 | "source": [ 42 | "## 1.
Variables" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "metadata": { 48 | "id": "leI-q2-eIhAi" 49 | }, 50 | "source": [ 51 | "a = 13\r\n", 52 | "b = 'Hello World'\r\n", 53 | "c = 42.42" 54 | ], 55 | "execution_count": null, 56 | "outputs": [] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "metadata": { 61 | "id": "zsNFqnBwIpek" 62 | }, 63 | "source": [ 64 | "x = 5\r\n", 65 | "y = 10\r\n", 66 | "z = x + y" 67 | ], 68 | "execution_count": null, 69 | "outputs": [] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "metadata": { 74 | "id": "Tj4h2vNAIwG_" 75 | }, 76 | "source": [ 77 | "z" 78 | ], 79 | "execution_count": null, 80 | "outputs": [] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": { 85 | "id": "-lXJYXB0IydG" 86 | }, 87 | "source": [ 88 | "## 2. Lists" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "metadata": { 94 | "id": "F3LJQFHtI1WD" 95 | }, 96 | "source": [ 97 | "a = 6\r\n", 98 | "\r\n", 99 | "l = [1, 2, 3, 4, a]" 100 | ], 101 | "execution_count": null, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "KqOHqNqLI6h8" 108 | }, 109 | "source": [ 110 | "l[3]" 111 | ], 112 | "execution_count": null, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "id": "AMn8I36JI9-U" 119 | }, 120 | "source": [ 121 | "la = [1, 2, 3]\r\n", 122 | "lb = [4, 5, 6]\r\n", 123 | "lol = [la, lb]" 124 | ], 125 | "execution_count": null, 126 | "outputs": [] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "metadata": { 131 | "id": "4-NijmHkJDgm" 132 | }, 133 | "source": [ 134 | "lol[0][1]" 135 | ], 136 | "execution_count": null, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "id": "GUYLPAK_JG_Q" 143 | }, 144 | "source": [ 145 | "l = [1, 2, 3, 4, 5]\r\n", 146 | "l" 147 | ], 148 | "execution_count": null, 149 | "outputs": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "metadata": { 154 | "id": "J0LjcxeRJJzw" 155 | }, 156 | 
"source": [ 157 | "l.append(0)\r\n", 158 | "l.append(3)\r\n", 159 | "l" 160 | ], 161 | "execution_count": null, 162 | "outputs": [] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "metadata": { 167 | "id": "nzehYNAgJNfv" 168 | }, 169 | "source": [ 170 | "l.sort()\r\n", 171 | "l" 172 | ], 173 | "execution_count": null, 174 | "outputs": [] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "id": "SwJG6SlcJTJ2" 180 | }, 181 | "source": [ 182 | "## 3. Loops" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "metadata": { 188 | "id": "HsK_tKdUJU3I" 189 | }, 190 | "source": [ 191 | "box = ['i0', 'i1', 'i2']" 192 | ], 193 | "execution_count": null, 194 | "outputs": [] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "metadata": { 199 | "id": "PagAlgemJYbQ" 200 | }, 201 | "source": [ 202 | "for item in box:\r\n", 203 | " print(item)" 204 | ], 205 | "execution_count": null, 206 | "outputs": [] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": { 211 | "id": "SWcLDIwcJdUN" 212 | }, 213 | "source": [ 214 | "## 4. If-Constructions" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "metadata": { 220 | "id": "yAzo2Nz0JfU-" 221 | }, 222 | "source": [ 223 | "a = 10" 224 | ], 225 | "execution_count": null, 226 | "outputs": [] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "metadata": { 231 | "id": "YF-O-gYIJoIT" 232 | }, 233 | "source": [ 234 | "if a > 15:\r\n", 235 | " print('A is greater than 15')\r\n", 236 | "else:\r\n", 237 | " print('A is not greater than 15')" 238 | ], 239 | "execution_count": null, 240 | "outputs": [] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "id": "Dk4Jd1a2JqQH" 246 | }, 247 | "source": [ 248 | "## 5. 
Functions" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "metadata": { 254 | "id": "kwbh8_SbJsBu" 255 | }, 256 | "source": [ 257 | "def add(a, b):\r\n", 258 | " result = a + b\r\n", 259 | " return result" 260 | ], 261 | "execution_count": null, 262 | "outputs": [] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "metadata": { 267 | "id": "LOQq9myIJwrw" 268 | }, 269 | "source": [ 270 | "add(5, 10)" 271 | ], 272 | "execution_count": null, 273 | "outputs": [] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "metadata": { 278 | "id": "yJOVBie9KTg6" 279 | }, 280 | "source": [ 281 | "print('Hello World')" 282 | ], 283 | "execution_count": null, 284 | "outputs": [] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "metadata": { 289 | "id": "4pBZpQYJKV0O" 290 | }, 291 | "source": [ 292 | "round(1.234, 0)" 293 | ], 294 | "execution_count": null, 295 | "outputs": [] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "metadata": { 300 | "id": "Mb3ReK4qJ3g-" 301 | }, 302 | "source": [ 303 | "## 6. 
Dictionaries" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "metadata": { 309 | "id": "NStegPvxJ4a3" 310 | }, 311 | "source": [ 312 | "word = {\r\n", 313 | " 'lemma': 'cat',\r\n", 314 | " 'pos': 'noun'\r\n", 315 | "}" 316 | ], 317 | "execution_count": null, 318 | "outputs": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "metadata": { 323 | "id": "D6iu_8g4J2Lw" 324 | }, 325 | "source": [ 326 | "word" 327 | ], 328 | "execution_count": null, 329 | "outputs": [] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "id": "vT0CuiNwZ5cy" 335 | }, 336 | "source": [ 337 | "## Example Counting\r\n", 338 | "See `scripts/count.py`" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "id": "Lpdtmi2eZ9AW" 345 | }, 346 | "source": [ 347 | "print('Counting up to ten!')\r\n", 348 | "\r\n", 349 | "for i in range(10):\r\n", 350 | " print(f'{i + 1} out of 10')\r\n", 351 | "\r\n", 352 | "print('Finished!')" 353 | ], 354 | "execution_count": null, 355 | "outputs": [] 356 | } 357 | ] 358 | } -------------------------------------------------------------------------------- /2020/scripts/count.py: -------------------------------------------------------------------------------- 1 | print('Counting up to ten!') 2 | 3 | for i in range(10): 4 | print(f'{i + 1} out of 10') 5 | 6 | print('Finished!') 7 | 8 | 9 | -------------------------------------------------------------------------------- /2020/scripts/helloworld.py: -------------------------------------------------------------------------------- 1 | # We often use _ to name variables if they are temporary or insignificant. 
2 | for _ in range(10): 3 | print('Hello World!') 4 | -------------------------------------------------------------------------------- /2020/scripts/my_functions.py: -------------------------------------------------------------------------------- 1 | def print_10_times(text): 2 | """This function prints the given text ten times.""" 3 | for _ in range(10): 4 | print(text) 5 | -------------------------------------------------------------------------------- /2020/scripts/use_functions.py: -------------------------------------------------------------------------------- 1 | import my_functions 2 | 3 | text_to_print = 'Hi!' 4 | 5 | my_functions.print_10_times(text_to_print) 6 | -------------------------------------------------------------------------------- /2020/slides/00 - Python Programming for Absolute Beginners.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/00 - Python Programming for Absolute Beginners.pdf -------------------------------------------------------------------------------- /2020/slides/00 - Python Programming for Absolute Beginners.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/00 - Python Programming for Absolute Beginners.pptx -------------------------------------------------------------------------------- /2020/slides/01 - PizzaProblem.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/01 - PizzaProblem.png -------------------------------------------------------------------------------- /2020/slides/01 - The Pizza Problem.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/01 - The Pizza Problem.pdf -------------------------------------------------------------------------------- /2020/slides/01 - The Pizza Problem.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/01 - The Pizza Problem.pptx -------------------------------------------------------------------------------- /2020/slides/02 - Working with Files, Texts, and Regular Expressions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/02 - Working with Files, Texts, and Regular Expressions.pdf -------------------------------------------------------------------------------- /2020/slides/02 - Working with Files, Texts, and Regular Expressions.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/02 - Working with Files, Texts, and Regular Expressions.pptx -------------------------------------------------------------------------------- /2020/slides/03 - Python for (Corpus) Linguists.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/03 - Python for (Corpus) Linguists.pdf -------------------------------------------------------------------------------- /2020/slides/03 - Python for (Corpus) Linguists.pptx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/03 - Python for (Corpus) Linguists.pptx -------------------------------------------------------------------------------- /2020/slides/04 - Summary and Resources.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/04 - Summary and Resources.pdf -------------------------------------------------------------------------------- /2020/slides/04 - Summary and Resources.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/04 - Summary and Resources.pptx -------------------------------------------------------------------------------- /2020/slides/05 - Setting Up Your Development Environment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/05 - Setting Up Your Development Environment.pdf -------------------------------------------------------------------------------- /2020/slides/05 - Setting Up Your Development Environment.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2020/slides/05 - Setting Up Your Development Environment.pptx -------------------------------------------------------------------------------- /2020/youtube-video-descriptions.md: -------------------------------------------------------------------------------- 1 | # YouTube 
Video Descriptions 2 | 3 | ## 00 - Python Programming for Absolute Beginners 4 | 5 | This video is part of my 2020 "Python Programming for Linguists" workshop. 6 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 7 | 8 | Content 9 | 00:00 Introduction & Overview 10 | 07:22 Python Programming for Absolute Beginners 11 | 22:45 Google Colab 12 | 25:52 Basics Overview 13 | 29:20 1. Variables 14 | 23:45 2. Lists 15 | 40:24 3. Loops 16 | 42:40 4. If-Constructions 17 | 44:45 5. Functions 18 | 49:50 6. Dictionaries 19 | 20 | ## 01 - The Pizza Problem 21 | 22 | This video is part of my 2020 "Python Programming for Linguists" workshop. 23 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 24 | 25 | Content 26 | 00:00 The Pizza Problem 27 | 03:17 Good Solutions 28 | 08:10 Modeling Pizzas 29 | 13:10 Determining Areas 30 | 22:00 PTER and Best/Worst Pizza 31 | 30:30 A Simple Algorithm 32 | 43:25 Bonus Exercises 33 | 34 | ## 02 - Working with Files, Texts, and Regular Expressions 35 | 36 | This video is part of my 2020 "Python Programming for Linguists" workshop. 37 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 38 | 39 | Content 40 | 00:00 Files, Texts, and Regular Expressions 41 | 00:11 Reading and Writing Files 42 | 08:20 Working with Text 43 | 15:30 Difflib 44 | 15:46 import Statement 45 | 20:23 Regular Expressions 46 | 29:57 TextDirectory 47 | 48 | ## 03 - Python for (Corpus) Linguists (Live Recording) 49 | 50 | Important Note: There is a newer version of this video/module available! Please consider watching the new (2022) video instead of this recording: https://youtu.be/DLyVL0mEISU 51 | 52 | This video is part of my 2020 "Python Programming for Linguists" workshop. 53 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 54 | 55 | This is the recording of the workshop live session "Python for (Corpus) Linguists". During the stream, I live coded exercises 8 to 16. 
The recording has been slightly shortened, and I have also added some explanatory slides. Given that this is a recording of a live session, it is not as polished as a screencast, and you can also watch me making mistakes along the way! :) 56 | 57 | The finished and somewhat polished Python notebook (Google Colab), as well as the slides, are available in the repository. 58 | 59 | Content 60 | 00:00 Live Session 61 | 03:23 New Syntax and Tools 62 | 15:16 Exercise Environment 63 | 18:00 Ex. 8 - Concordancer 64 | 37:19 Ex. 9 - N-Grams 65 | 47:15 Ex. 10 - Frequency Analysis 66 | 53:01 Ex. 11 - Computing Basic Statistics 67 | 01:24:37 Ex. 12 - NLTK Stemming, Lemmatization, and WordNet 68 | 01:35:40 Ex. 13 - spaCy Tagging 69 | 01:48:50 Ex. 14 - XML Parsing and XPath 70 | 02:02:40 Ex. 15 - Web Scraping 71 | 02:11:15 Ex. 16 - Putting Everything Together (Keywords) 72 | 73 | ## 04 - Summary and Resources 74 | 75 | This video is part of my 2020 "Python Programming for Linguists" workshop. 76 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 77 | 78 | Content 79 | 00:00 Conclusion 80 | 00:18 Learning Objectives 81 | 06:13 What You Might Have Missed 82 | 07:09 No-Code Generation 83 | 09:23 Books 84 | 09:56 Online Courses 85 | 11:17 Other Resources 86 | 87 | ## 05 - Setting Up Your Development Environment 88 | 89 | This video is part of my 2020 "Python Programming for Linguists" workshop.
90 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 91 | -------------------------------------------------------------------------------- /2021/data/corpora/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/data/corpora/.gitkeep -------------------------------------------------------------------------------- /2021/data/data.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | ## Downloadable Data 4 | 5 | ### The Adventures of Sherlock Holmes 6 | 7 | The [Complete Sherlock Holmes](https://sherlock-holm.es/ascii/) is available in plain text (ASCII) format from [sherlock-holm.es](https://sherlock-holm.es). 8 | 9 | Using the script below, you can download the 12 short stories contained in the collection *The Adventures of Sherlock Holmes*. 10 | 11 | ```bash 12 | !git clone https://github.com/IngoKl/python-programming-for-linguists 13 | !cd python-programming-for-linguists/2021/data && sh download_sherlockholmes.sh 14 | ``` 15 | 16 | The data, as individual text files, will then be available in `2021/data/corpora/holmes`.
17 | -------------------------------------------------------------------------------- /2021/data/download_sherlockholmes.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | mkdir -p ./corpora/holmes 4 | mkdir temp 5 | 6 | cd ./corpora/holmes 7 | wget https://sherlock-holm.es/stories/plain-text/scan.txt 8 | wget https://sherlock-holm.es/stories/plain-text/redh.txt 9 | wget https://sherlock-holm.es/stories/plain-text/iden.txt 10 | wget https://sherlock-holm.es/stories/plain-text/bosc.txt 11 | wget https://sherlock-holm.es/stories/plain-text/five.txt 12 | wget https://sherlock-holm.es/stories/plain-text/twis.txt 13 | wget https://sherlock-holm.es/stories/plain-text/blue.txt 14 | wget https://sherlock-holm.es/stories/plain-text/spec.txt 15 | wget https://sherlock-holm.es/stories/plain-text/engr.txt 16 | wget https://sherlock-holm.es/stories/plain-text/nobl.txt 17 | wget https://sherlock-holm.es/stories/plain-text/bery.txt 18 | wget https://sherlock-holm.es/stories/plain-text/copp.txt -------------------------------------------------------------------------------- /2021/exercises/Additional_Exercises_Frequency_Distribution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Additional_Exercises_Frequency_Distribution", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "M8LHZw5jxwmW" 23 | }, 24 | "source": [ 25 | "# Additional Exercises - Frequency Distribution\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "WFTxNWfN8Vua" 32 | }, 33 | "source": [ 34 | "In this notebook (set of exercises) we will create a tool that, 
given a corpus of text files and a search term, is able to provide us with information about the frequency distribution of the term across the files in the corpus.\n", 35 | "\n", 36 | "The solution we will implement is not perfect and very much influenced by educational decisions. After finishing the exercise, you will be invited to think about ways of optimizing the solution." 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "gHvM1cmJ8qd7" 43 | }, 44 | "source": [ 45 | "## Setup\n", 46 | "\n", 47 | "This is a little bit of setup. First, we import necessary libraries. Of course, feel free to add libraries as needed! After, we clone the workshop repository and use the provided helper script to download a series of Sherlock Holmes short stories." 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "metadata": { 53 | "id": "CTSVAtts8RdO" 54 | }, 55 | "source": [ 56 | "# Regular Expressions\n", 57 | "import re\n", 58 | "\n", 59 | "# Pathlib\n", 60 | "from pathlib import Path\n", 61 | "\n", 62 | "# Counter for getting frequencies\n", 63 | "from collections import Counter\n", 64 | "\n", 65 | "# DataFrames\n", 66 | "import pandas as pd\n", 67 | "\n", 68 | "# Visualization\n", 69 | "import seaborn as sns\n", 70 | "import matplotlib.pyplot as plt" 71 | ], 72 | "execution_count": null, 73 | "outputs": [] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "metadata": { 78 | "id": "jCqH7MeyxzRa" 79 | }, 80 | "source": [ 81 | "%%capture\n", 82 | "!git clone https://github.com/IngoKl/python-programming-for-linguists\n", 83 | "!cd python-programming-for-linguists/2021/data && sh download_sherlockholmes.sh" 84 | ], 85 | "execution_count": null, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": { 91 | "id": "l-txyxOP9G5s" 92 | }, 93 | "source": [ 94 | "## Step 1: Preparing the Data\n", 95 | "\n", 96 | "After running the `download_sherlockholmes.sh` script above, you will have 12 short stories (*The Adventures of Sherlock 
Holmes*) in the `python-programming-for-linguists/2021/data/corpora/holmes` folder.\n", 97 | "\n", 98 | "The goal of this first step is to read and prepare the data. Your goal will be to create the data structure below. \n", 99 | "\n", 100 | "Please note that there are other, better and more efficient, data structures to achieve the same goals. However, we are building a solution that mirrors practices in corpus linguistics without being too conscious of memory or computation limitations." 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "metadata": { 106 | "id": "5nlXgAZq9GYH" 107 | }, 108 | "source": [ 109 | "corpus = [\n", 110 | " {\n", 111 | " 'filename': 'bery.txt', \n", 112 | " 'text': '...', \n", 113 | " 'story_title': 'THE ADVENTURE OF THE BERYL CORONET', \n", 114 | " 'length': None, \n", 115 | " 'frequencies': {}\n", 116 | " },\n", 117 | "]\n", 118 | "\n", 119 | "corpus" 120 | ], 121 | "execution_count": null, 122 | "outputs": [] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "id": "bPGW5NIW-nGN" 128 | }, 129 | "source": [ 130 | "Obviously, your solution will create a list with more than one item. The `frequencies` dictionary as well as `length` can be empty for now. We will populate it in the next step. `text` is supposed to contain the actual text.\n", 131 | "\n", 132 | "If you want to, you can preprocess the text before adding it to `corpus`.\n", 133 | "\n", 134 | "The trickiest bit is getting the `story_title` from the file. Have a look at one of the actual text files and remember what you've learned about regular expressions."
135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "metadata": { 140 | "id": "Vx13B3qyAD3q" 141 | }, 142 | "source": [ 143 | "def get_story_title(text):\n", 144 | " # YOUR CODE\n", 145 | " title = None\n", 146 | "\n", 147 | " return title\n", 148 | "\n", 149 | "def preprocess_text(text):\n", 150 | " # YOUR CODE\n", 151 | "\n", 152 | " return text" 153 | ], 154 | "execution_count": null, 155 | "outputs": [] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "metadata": { 160 | "id": "y_17lprz_MFg" 161 | }, 162 | "source": [ 163 | "corpus = []\n", 164 | "files = Path('python-programming-for-linguists/2021/data/corpora/holmes').glob('*.txt')\n", 165 | "\n", 166 | "# YOUR CODE" 167 | ], 168 | "execution_count": null, 169 | "outputs": [] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "id": "_qizNQCTDTSt" 175 | }, 176 | "source": [ 177 | "## Step 2: Getting the Frequencies\n", 178 | "\n", 179 | "You will need to generate frequency tables and add them to `corpus`. At the same time, you should populate `length` with the number of tokens in the document.\n", 180 | "\n", 181 | "This also means that you will have to tokenize the stories first. 
Remember that you can use `dict()` to turn a `Counter` object into a dictionary.\n", 182 | "\n", 183 | "Ultimately, `frequencies`, for each story, should contain a structure like the one below:" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "metadata": { 189 | "id": "gnZ_FDT8Dn4P" 190 | }, 191 | "source": [ 192 | "frequencies = {\n", 193 | " 'word_a': 42,\n", 194 | " 'word_b': 12,\n", 195 | "}" 196 | ], 197 | "execution_count": null, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "id": "i5VrpJaVDufW" 204 | }, 205 | "source": [ 206 | "def tokenize(text):\n", 207 | " # YOUR CODE\n", 208 | " pass" 209 | ], 210 | "execution_count": null, 211 | "outputs": [] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "metadata": { 216 | "id": "5ObMUj8gDy5f" 217 | }, 218 | "source": [ 219 | "# YOUR CODE" 220 | ], 221 | "execution_count": null, 222 | "outputs": [] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "id": "dhScros9GJeW" 228 | }, 229 | "source": [ 230 | "## Step 3: Frequencies and Frequency Distribution\n", 231 | "\n", 232 | "Now you will need to write a function that takes a `corpus` as well as `search_term`.
You will also need to account for both the absolute as well as the relative (per 1,000 tokens) frequencies.\n", 233 | "\n", 234 | "If you need to check whether something is in a dictionary, you can do the following: `if x in y`\n", 235 | "\n", 236 | "You will generate a frequency table for the search term that looks as follows:" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "metadata": { 242 | "id": "wclpClooGXZr" 243 | }, 244 | "source": [ 245 | "frequency_table = {\n", 246 | " # Filename: (abs_frequency, rel_frequency_per_1000)\n", 247 | " 'story_title_a': (1, 2),\n", 248 | " 'story_title_b': (1, 2)\n", 249 | "}" 250 | ], 251 | "execution_count": null, 252 | "outputs": [] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "metadata": { 257 | "id": "7ZZufRoQG_S1" 258 | }, 259 | "source": [ 260 | "def get_frequencies(corpus, search_term):\n", 261 | " frequency_table = {}\n", 262 | "\n", 263 | " # YOUR CODE\n", 264 | "\n", 265 | " return frequency_table" 266 | ], 267 | "execution_count": null, 268 | "outputs": [] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": { 273 | "id": "fzoGahWOHF5-" 274 | }, 275 | "source": [ 276 | "The following code is **provided for you**. You don't have to change anything here. You just need to make sure that your `get_frequencies` function works well with it. 
\n", 277 | "\n", 278 | "* We will nicely print the results\n", 279 | "* We will calculate a very basic dispersion statistic (Range_2)\n", 280 | "* We will plot the results using `seaborn`" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "metadata": { 286 | "id": "B77mZcOA8H_4" 287 | }, 288 | "source": [ 289 | "def plot_frequency_table(frequency_table, search_term):\n", 290 | "\n", 291 | " df = pd.DataFrame(frequency_table).transpose()\n", 292 | " df.columns = ['abs_frequency', 'rel_frequency']\n", 293 | " df = df.sort_values('rel_frequency', ascending=False)\n", 294 | "\n", 295 | " ax = sns.barplot(y=df.index, x='rel_frequency', data=df, color='#EF2D56')\n", 296 | " ax.set_title(f'Frequency Distribution of {search_term} (per 1,000 Tokens)')" 297 | ], 298 | "execution_count": null, 299 | "outputs": [] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "metadata": { 304 | "id": "aEOGDIi-HNTR" 305 | }, 306 | "source": [ 307 | "search_term = 'watson'\n", 308 | "\n", 309 | "parts_with_st = 0\n", 310 | "frequency_table = get_frequencies(corpus, search_term)\n", 311 | "\n", 312 | "print(f'Distribution of \"{search_term}\":\\n')\n", 313 | "for s in frequency_table:\n", 314 | " \n", 315 | " if frequency_table[s][0] > 0:\n", 316 | " parts_with_st += 1\n", 317 | "\n", 318 | " print(f'- {frequency_table[s][0]} ({round(frequency_table[s][1], 2)} per 1,000 tokens) in {s}')\n", 319 | "\n", 320 | "# Range_2\n", 321 | "range_2 = ( parts_with_st / len(frequency_table.keys()) ) * 100\n", 322 | "\n", 323 | "print(f'\\nThe Range_2 is: {round(range_2, 2)}%\\n')\n", 324 | "\n", 325 | "plot_frequency_table(frequency_table, search_term)" 326 | ], 327 | "execution_count": null, 328 | "outputs": [] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "id": "t98Pxz8uGkSn" 334 | }, 335 | "source": [ 336 | "## Making it Better\n", 337 | "\n", 338 | "Now we have a working solution that does what we set out to do. 
However, it is definitely not the most efficient or most elegant solution.\n", 339 | "\n", 340 | "Can you think of some ways of making this program better?" 341 | ] 342 | } 343 | ] 344 | } -------------------------------------------------------------------------------- /2021/exercises/Additional_Exercises_RegEx.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Additional_Exercises_RegEx.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [ 9 | "Cqxm6b_VpxGz" 10 | ] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | } 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "edUynZOdKwve" 25 | }, 26 | "source": [ 27 | "# Additional Exercises - Regular Expressions" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "rl-SxvedLN-3" 34 | }, 35 | "source": [ 36 | "import re" 37 | ], 38 | "execution_count": null, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "XEmhlAOhLqIn" 45 | }, 46 | "source": [ 47 | "Please be aware that there is **a [video](https://www.youtube.com/watch?v=GGEveroG3Fg)** going through these exercises. 🙂" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "Cqxm6b_VpxGz" 54 | }, 55 | "source": [ 56 | "## Helper Function\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "id": "wJANYqBVp8eA" 64 | }, 65 | "source": [ 66 | "This is a helper function that will help you during the exercises. You will not have to change it but feel free to play around!" 
67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "udmQReyrMAzw" 73 | }, 74 | "source": [ 75 | "def regex_exercise(text, regular_expression, desired_output):\n", 76 | " '''This is a helper function that will run a re.findall() \n", 77 | " and return matches nicely.'''\n", 78 | "\n", 79 | " if regular_expression:\n", 80 | " regular_expression = f'({regular_expression})'\n", 81 | " matches = re.findall(regular_expression, text)\n", 82 | "\n", 83 | " print(f'Matching \"{regular_expression}\" and \"{text[0:10]} ...\"')\n", 84 | "\n", 85 | " if matches == desired_output:\n", 86 | " print(f'\\nSUCCESS')\n", 87 | " print(f'Desired Output: {desired_output}')\n", 88 | " print(f'Your Output: {matches}')\n", 89 | " else:\n", 90 | " print(f'\\nTRY AGAIN')\n", 91 | " print(f'Desired Output: {desired_output}')\n", 92 | " print(f'Your Output: {matches}')" 93 | ], 94 | "execution_count": null, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "id": "SY2U1-NFMuml" 101 | }, 102 | "source": [ 103 | "## Exercises" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "id": "oOqsVRcbp3YF" 110 | }, 111 | "source": [ 112 | "### Demo" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "colab": { 119 | "base_uri": "https://localhost:8080/" 120 | }, 121 | "id": "oP67U_LLp2Zd", 122 | "outputId": "72790ee7-98fa-4d52-ab65-0ca3a3f15056" 123 | }, 124 | "source": [ 125 | "text = 'She had one cat while her friends had three cats.'\n", 126 | "desired_output = ['one cat', 'three cats']\n", 127 | "\n", 128 | "regular_expression = '\\w+ cats?'\n", 129 | "\n", 130 | "regex_exercise(text, regular_expression, desired_output)" 131 | ], 132 | "execution_count": null, 133 | "outputs": [ 134 | { 135 | "output_type": "stream", 136 | "text": [ 137 | "Matching \"(\\w+ cats?)\" and \"She had on ...\"\n", 138 | "\n", 139 | "SUCCESS\n", 140 | "Desired Output: ['one cat', 'three cats']\n", 141 | 
"Your Output: ['one cat', 'three cats']\n" 142 | ], 143 | "name": "stdout" 144 | } 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "iVecUGESV4vY" 151 | }, 152 | "source": [ 153 | "### Exercise 1 - Color and Colour\n", 154 | "\n", 155 | "Write an expression that matches both `the color X` and `the colour X`." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "id": "aMfWDMlWV-RQ" 162 | }, 163 | "source": [ 164 | "text = '''She likes the color fern while he prefers the colour lilac.'''\n", 165 | "desired_output = [('the color fern', 'color'), ('the colour lilac', 'colour')]\n", 166 | "\n", 167 | "# YOUR CODE GOES HERE\n", 168 | "regular_expression = r''\n", 169 | "\n", 170 | "regex_exercise(text, regular_expression, desired_output)" 171 | ], 172 | "execution_count": null, 173 | "outputs": [] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "CSKQ8PX_VOBw" 179 | }, 180 | "source": [ 181 | "### Exercise 2: -ing Forms\n", 182 | "\n", 183 | "Write an expression that matches all words (tokens) ending in -ing." 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "metadata": { 189 | "id": "n8LkYEfIV9Ex" 190 | }, 191 | "source": [ 192 | "text = '''The two lions are moving towards their prey; \n", 193 | "waiting for it to make a move.'''\n", 194 | "desired_output = ['moving', 'waiting']\n", 195 | "\n", 196 | "# YOUR CODE GOES HERE\n", 197 | "regular_expression = r''\n", 198 | "\n", 199 | "regex_exercise(text, regular_expression, desired_output)" 200 | ], 201 | "execution_count": null, 202 | "outputs": [] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": { 207 | "id": "ceflSXLCbHv_" 208 | }, 209 | "source": [ 210 | "### Exercise 3 - CCV Syllables / Consonant Clusters\n", 211 | "\n", 212 | "Write an expresion that matches CCV syllables (e.g., blue, glow, shred, free). 
Since this is not a class on phonetics, we will simply assume vowels to be a, e, i, o, u, (y)." 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "metadata": { 218 | "id": "GBOABwoCV-u-" 219 | }, 220 | "source": [ 221 | "text = '''The blue monster was set free.'''\n", 222 | "desired_output = ['blu', 'ste', 'fre']\n", 223 | "\n", 224 | "# YOUR CODE GOES HERE\n", 225 | "regular_expression = f''\n", 226 | "\n", 227 | "regex_exercise(text, regular_expression, desired_output)" 228 | ], 229 | "execution_count": null, 230 | "outputs": [] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": { 235 | "id": "drNa_7DEV5Ej" 236 | }, 237 | "source": [ 238 | "\n", 239 | "### Exercise 4 - Old School Annotations\n", 240 | "\n", 241 | "Write an expression that matches all nouns from this BROWN-style example." 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "metadata": { 247 | "id": "YHndLo0hV_bB" 248 | }, 249 | "source": [ 250 | "text = '''Many/ap visitors/nns will/md be/be taking/vbg seats/nns. \n", 251 | "The/at organizer/nn has/hvz said/vbn the/at event/nn \n", 252 | "will/md be/be successful/jj.'''\n", 253 | "desired_output = [('visitors/nns', 'visitors'), ('seats/nns', 'seats'), ('organizer/nn', 'organizer'), ('event/nn', 'event')]\n", 254 | "\n", 255 | "# YOUR CODE GOES HERE\n", 256 | "regular_expression = r''\n", 257 | "\n", 258 | "regex_exercise(text, regular_expression, desired_output)" 259 | ], 260 | "execution_count": null, 261 | "outputs": [] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": { 266 | "id": "JrmZL4NdV5JL" 267 | }, 268 | "source": [ 269 | "### Exercise 5 - Determiners\n", 270 | "\n", 271 | "Write an expression that matches all determiners (defined as: a, an, the) in the text." 
272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "metadata": { 277 | "id": "vMv-esVRVb6o" 278 | }, 279 | "source": [ 280 | "text = '''The car and a bike were driving down \n", 281 | "an alley when the accident happened.'''\n", 282 | "desired_output = ['The', 'a', 'an', 'the']\n", 283 | "\n", 284 | "# YOUR CODE GOES HERE\n", 285 | "regular_expression = r''\n", 286 | "\n", 287 | "regex_exercise(text, regular_expression, desired_output)" 288 | ], 289 | "execution_count": null, 290 | "outputs": [] 291 | } 292 | ] 293 | } -------------------------------------------------------------------------------- /2021/exercises/Additional_Exercises_Solutions_RegEx.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Additional_Exercises_RegEx.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [ 9 | "Cqxm6b_VpxGz" 10 | ] 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | } 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "id": "edUynZOdKwve" 25 | }, 26 | "source": [ 27 | "# Additional Exercises - Regular Expressions" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "rl-SxvedLN-3" 34 | }, 35 | "source": [ 36 | "import re" 37 | ], 38 | "execution_count": null, 39 | "outputs": [] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "id": "XEmhlAOhLqIn" 45 | }, 46 | "source": [ 47 | "Please be aware that there is **a [video](https://www.youtube.com/watch?v=GGEveroG3Fg)** going through these exercises. 
🙂" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "id": "Cqxm6b_VpxGz" 54 | }, 55 | "source": [ 56 | "## Helper Function\n", 57 | "\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "id": "wJANYqBVp8eA" 64 | }, 65 | "source": [ 66 | "This is a helper function that will help you during the exercises. You will not have to change it but feel free to play around!" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "metadata": { 72 | "id": "udmQReyrMAzw" 73 | }, 74 | "source": [ 75 | "def regex_exercise(text, regular_expression, desired_output):\n", 76 | " '''This is a helper function that will run a re.findall() \n", 77 | " and return matches nicely.'''\n", 78 | "\n", 79 | " if regular_expression:\n", 80 | " regular_expression = f'({regular_expression})'\n", 81 | " matches = re.findall(regular_expression, text)\n", 82 | "\n", 83 | " print(f'Matching \"{regular_expression}\" and \"{text[0:10]} ...\"')\n", 84 | "\n", 85 | " if matches == desired_output:\n", 86 | " print(f'\\nSUCCESS')\n", 87 | " print(f'Desired Output: {desired_output}')\n", 88 | " print(f'Your Output: {matches}')\n", 89 | " else:\n", 90 | " print(f'\\nTRY AGAIN')\n", 91 | " print(f'Desired Output: {desired_output}')\n", 92 | " print(f'Your Output: {matches}')" 93 | ], 94 | "execution_count": null, 95 | "outputs": [] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": { 100 | "id": "SY2U1-NFMuml" 101 | }, 102 | "source": [ 103 | "## Exercises" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "id": "oOqsVRcbp3YF" 110 | }, 111 | "source": [ 112 | "### Demo" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "colab": { 119 | "base_uri": "https://localhost:8080/" 120 | }, 121 | "id": "oP67U_LLp2Zd", 122 | "outputId": "72790ee7-98fa-4d52-ab65-0ca3a3f15056" 123 | }, 124 | "source": [ 125 | "text = 'She had one cat while her friends had three cats.'\n", 126 | "desired_output = ['one 
cat', 'three cats']\n", 127 | "\n", 128 | "regular_expression = '\\w+ cats?'\n", 129 | "\n", 130 | "regex_exercise(text, regular_expression, desired_output)" 131 | ], 132 | "execution_count": null, 133 | "outputs": [ 134 | { 135 | "output_type": "stream", 136 | "text": [ 137 | "Matching \"(\\w+ cats?)\" and \"She had on ...\"\n", 138 | "\n", 139 | "SUCCESS\n", 140 | "Desired Output: ['one cat', 'three cats']\n", 141 | "Your Output: ['one cat', 'three cats']\n" 142 | ], 143 | "name": "stdout" 144 | } 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "id": "iVecUGESV4vY" 151 | }, 152 | "source": [ 153 | "### Exercise 1 - Color and Colour\n", 154 | "\n", 155 | "Write an expression that matches both `the color X` and `the colour X`." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "metadata": { 161 | "colab": { 162 | "base_uri": "https://localhost:8080/" 163 | }, 164 | "id": "aMfWDMlWV-RQ", 165 | "outputId": "cfb7ce29-33a2-4e5f-da86-768d61577a9f" 166 | }, 167 | "source": [ 168 | "text = '''She likes the color fern while he prefers the colour lilac.'''\n", 169 | "desired_output = [('the color fern', 'color'), ('the colour lilac', 'colour')]\n", 170 | "\n", 171 | "# YOUR CODE GOES HERE\n", 172 | "regular_expression = r'the (color|colour) \\w+'\n", 173 | "#regular_expression = r'the colou?r \\w+'\n", 174 | "\n", 175 | "regex_exercise(text, regular_expression, desired_output)" 176 | ], 177 | "execution_count": null, 178 | "outputs": [ 179 | { 180 | "output_type": "stream", 181 | "text": [ 182 | "Matching \"(the (color|colour) \\w+)\" and \"She likes ...\"\n", 183 | "\n", 184 | "SUCCESS\n", 185 | "Desired Output: [('the color fern', 'color'), ('the colour lilac', 'colour')]\n", 186 | "Your Output: [('the color fern', 'color'), ('the colour lilac', 'colour')]\n" 187 | ], 188 | "name": "stdout" 189 | } 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "id": "CSKQ8PX_VOBw" 196 | }, 197 | "source": [ 198 
| "### Exercise 2: -ing Forms\n", 199 | "\n", 200 | "Write an expression that matches all words (tokens) ending in -ing." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "metadata": { 206 | "colab": { 207 | "base_uri": "https://localhost:8080/" 208 | }, 209 | "id": "n8LkYEfIV9Ex", 210 | "outputId": "d9314c74-f7d2-4460-e8a4-4c2467301cfd" 211 | }, 212 | "source": [ 213 | "text = '''The two lions are moving towards their prey; \n", 214 | "waiting for it to make a move.'''\n", 215 | "desired_output = ['moving', 'waiting']\n", 216 | "\n", 217 | "# YOUR CODE GOES HERE\n", 218 | "regular_expression = r'\\w+ing'\n", 219 | "\n", 220 | "regex_exercise(text, regular_expression, desired_output)" 221 | ], 222 | "execution_count": null, 223 | "outputs": [ 224 | { 225 | "output_type": "stream", 226 | "text": [ 227 | "Matching \"(\\w+ing)\" and \"The two li ...\"\n", 228 | "\n", 229 | "SUCCESS\n", 230 | "Desired Output: ['moving', 'waiting']\n", 231 | "Your Output: ['moving', 'waiting']\n" 232 | ], 233 | "name": "stdout" 234 | } 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "id": "ceflSXLCbHv_" 241 | }, 242 | "source": [ 243 | "### Exercise 3 - CCV Syllables / Consonant Clusters\n", 244 | "\n", 245 | "Write an expression that matches CCV syllables (e.g., blue, glow, shred, free). Since this is not a class on phonetics, we will simply assume vowels to be a, e, i, o, u, (y)."
246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "metadata": { 251 | "colab": { 252 | "base_uri": "https://localhost:8080/" 253 | }, 254 | "id": "GBOABwoCV-u-", 255 | "outputId": "4165d0a3-79a6-4d34-a635-c4c59f62c75d" 256 | }, 257 | "source": [ 258 | "text = '''The blue monster was set free.'''\n", 259 | "desired_output = ['blu', 'ste', 'fre']\n", 260 | "\n", 261 | "# YOUR CODE GOES HERE\n", 262 | "v = '[aeiouy]'\n", 263 | "c = '[bcdfgjklmnpqstvxzhrw]'\n", 264 | "regular_expression = f'{c}{c}{v}'\n", 265 | "\n", 266 | "regex_exercise(text, regular_expression, desired_output)" 267 | ], 268 | "execution_count": null, 269 | "outputs": [ 270 | { 271 | "output_type": "stream", 272 | "text": [ 273 | "Matching \"([bcdfgjklmnpqstvxzhrw][bcdfgjklmnpqstvxzhrw][aeiouy])\" and \"The blue m ...\"\n", 274 | "\n", 275 | "SUCCESS\n", 276 | "Desired Output: ['blu', 'ste', 'fre']\n", 277 | "Your Output: ['blu', 'ste', 'fre']\n" 278 | ], 279 | "name": "stdout" 280 | } 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "metadata": { 286 | "id": "drNa_7DEV5Ej" 287 | }, 288 | "source": [ 289 | "\n", 290 | "### Exercise 4 - Old School Annotations\n", 291 | "\n", 292 | "Write an expression that matches all nouns from this BROWN-style example." 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "metadata": { 298 | "colab": { 299 | "base_uri": "https://localhost:8080/" 300 | }, 301 | "id": "YHndLo0hV_bB", 302 | "outputId": "814dd0c0-84f0-4807-8c9b-dabf4a881a80" 303 | }, 304 | "source": [ 305 | "text = '''Many/ap visitors/nns will/md be/be taking/vbg seats/nns. 
\n", 306 | "The/at organizer/nn has/hvz said/vbn the/at event/nn \n", 307 | "will/md be/be successful/jj.'''\n", 308 | "desired_output = [('visitors/nns', 'visitors'), ('seats/nns', 'seats'), ('organizer/nn', 'organizer'), ('event/nn', 'event')]\n", 309 | "\n", 310 | "# YOUR CODE GOES HERE\n", 311 | "regular_expression = r'(\\w+)/n\\w*'\n", 312 | "\n", 313 | "regex_exercise(text, regular_expression, desired_output)" 314 | ], 315 | "execution_count": null, 316 | "outputs": [ 317 | { 318 | "output_type": "stream", 319 | "text": [ 320 | "Matching \"((\\w+)/n\\w*)\" and \"Many/ap vi ...\"\n", 321 | "\n", 322 | "SUCCESS\n", 323 | "Desired Output: [('visitors/nns', 'visitors'), ('seats/nns', 'seats'), ('organizer/nn', 'organizer'), ('event/nn', 'event')]\n", 324 | "Your Output: [('visitors/nns', 'visitors'), ('seats/nns', 'seats'), ('organizer/nn', 'organizer'), ('event/nn', 'event')]\n" 325 | ], 326 | "name": "stdout" 327 | } 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": { 333 | "id": "JrmZL4NdV5JL" 334 | }, 335 | "source": [ 336 | "### Exercise 5 - Determiners\n", 337 | "\n", 338 | "Write an expression that matches all determiners (defined as: a, an, the) in the text." 
339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "metadata": { 344 | "colab": { 345 | "base_uri": "https://localhost:8080/" 346 | }, 347 | "id": "vMv-esVRVb6o", 348 | "outputId": "d0f27f8f-5a3b-49c9-daaa-7810b56c268c" 349 | }, 350 | "source": [ 351 | "text = '''The car and a bike were driving down \n", 352 | "an alley when the accident happened.'''\n", 353 | "desired_output = ['The', 'a', 'an', 'the']\n", 354 | "\n", 355 | "# YOUR CODE GOES HERE\n", 356 | "regular_expression = r'\\b[Aa]\\b|\\b[Aa]n\\b|[Tt]he'\n", 357 | "\n", 358 | "regex_exercise(text, regular_expression, desired_output)" 359 | ], 360 | "execution_count": null, 361 | "outputs": [ 362 | { 363 | "output_type": "stream", 364 | "text": [ 365 | "Matching \"(\\b[Aa]\\b|\\b[Aa]n\\b|[Tt]he)\" and \"The car an ...\"\n", 366 | "\n", 367 | "SUCCESS\n", 368 | "Desired Output: ['The', 'a', 'an', 'the']\n", 369 | "Your Output: ['The', 'a', 'an', 'the']\n" 370 | ], 371 | "name": "stdout" 372 | } 373 | ] 374 | } 375 | ] 376 | } -------------------------------------------------------------------------------- /2021/exercises/Exercises 8-17.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/exercises/Exercises 8-17.docx -------------------------------------------------------------------------------- /2021/exercises/Exercises 8-17.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/exercises/Exercises 8-17.pdf -------------------------------------------------------------------------------- /2021/exercises/Exercises_8_to_17.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": { 
7 | "id": "g8L_jaAr9S8v" 8 | }, 9 | "source": [ 10 | "# Python Programming for Linguists\n", 11 | "**03 - Python for (Corpus) Linguists**\n", 12 | "as of 2021-06-11 (Minor Fixes as of 2023-01-07)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": { 18 | "id": "iJwE7TYEo1_7" 19 | }, 20 | "source": [ 21 | "## 1. Environment and Data" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": { 27 | "id": "VOAC6yCfxu-f" 28 | }, 29 | "source": [ 30 | "Before we begin, we need to set up **our development environment**.\n", 31 | "\n", 32 | "First, we will download (*git cloning*) the workshop repository. The [\"magic command\"](https://ipython.readthedocs.io/en/stable/interactive/magics.html) `%%capture` will suppress any cell output. Be careful: `rm -r python-programming-for-linguists` will delete previous files.\n", 33 | "\n", 34 | "\n", 35 | "Next, we are installing two additional libraries/dependencies: `textdirectory` and `justext`. While many libraries are available on Colab, some need (and can) be installed using `pip`.\n", 36 | "\n", 37 | "Then we are `import`-ing all the needed dependencies.\n", 38 | "\n", 39 | "Finally, we are using two scripts, provided in the repository, to download two corpora.\n", 40 | "\n", 41 | "In addition, we will define a `print_dict` helper function that we will use to look at large dictionaries without breaking *Colab*." 
42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": { 48 | "id": "7mHW5hAMApoA" 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "%%capture\n", 53 | "!rm -r python-programming-for-linguists\n", 54 | "!git clone https://github.com/IngoKl/python-programming-for-linguists" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "id": "PoWWCg--DtgW" 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "%%capture\n", 66 | "!pip install textdirectory --upgrade\n", 67 | "!pip install justext" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "id": "eneM8GARD4Yg" 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# Basics from Python's standard library\n", 79 | "import re\n", 80 | "import statistics\n", 81 | "import math\n", 82 | "\n", 83 | "from collections import Counter\n", 84 | "from operator import itemgetter\n", 85 | "\n", 86 | "from io import StringIO\n", 87 | "\n", 88 | "# Data Science\n", 89 | "import pandas as pd\n", 90 | "import numpy as np\n", 91 | "\n", 92 | "# Plotting\n", 93 | "import matplotlib.pyplot as plt\n", 94 | "import seaborn as sns\n", 95 | "\n", 96 | "# XML\n", 97 | "import lxml\n", 98 | "\n", 99 | "# NLP\n", 100 | "import nltk\n", 101 | "from nltk.corpus import wordnet\n", 102 | "from nltk.stem import PorterStemmer\n", 103 | "from nltk.stem import LancasterStemmer\n", 104 | "from nltk.stem import WordNetLemmatizer\n", 105 | "from nltk.collocations import BigramCollocationFinder\n", 106 | "from nltk.collocations import BigramAssocMeasures\n", 107 | "\n", 108 | "import spacy\n", 109 | "from spacy import displacy\n", 110 | "\n", 111 | "import textdirectory\n", 112 | "\n", 113 | "# Web\n", 114 | "import requests\n", 115 | "from bs4 import BeautifulSoup\n", 116 | "import justext\n", 117 | "\n", 118 | "# Formatting output\n", 119 | "from tabulate import tabulate" 120 | ] 121 | }, 122 | { 123 | "cell_type": 
"markdown", 124 | "metadata": { 125 | "id": "TeAeFCirn-cW" 126 | }, 127 | "source": [ 128 | "Downloading two corpora (HUM19UK and COCA sampler)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "id": "m7DAj4DhD-IO" 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "%%capture\n", 140 | "!cd python-programming-for-linguists/2020/data && sh download_hum19uk.sh\n", 141 | "!cd python-programming-for-linguists/2020/data && sh download_coca.sh" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "id": "sJHMf2II4GWY" 148 | }, 149 | "source": [ 150 | "Helper function for looking at large dictionaries:" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "id": "zv1PXbG233Kz" 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "def print_dict(d, top=10):\n", 162 | " print(list(d.items())[0:top])" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "id": "pBzngIdcLEeZ" 169 | }, 170 | "source": [ 171 | "## 2. New Tools and Hints" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": { 177 | "id": "Tb1qBmMkgz4L" 178 | }, 179 | "source": [ 180 | "### Classes and Objects" 181 | ] 182 | }, 183 | { 184 | "cell_type": "markdown", 185 | "metadata": { 186 | "id": "tLAE2PVmJSj5" 187 | }, 188 | "source": [ 189 | "You can think of classes as blueprints for objects. An object, which is an instantiation of a class, can have attributes and methods (basically functions tied to the object). There's lots more to this, but this should get you going!\n", 190 | "\n", 191 | "Here we create a new class `Word`. The class has two attributes (`word` and `length`) as well as one method `reverse`." 
192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "id": "VpnfgklhJGCA" 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "class Word():\n", 203 | " \n", 204 | " def __init__(self, word):\n", 205 | " self.word = word\n", 206 | " self.length = len(word)\n", 207 | "\n", 208 | " def reverse(self):\n", 209 | " self.word = self.word[::-1]" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "id": "eNUoc8LpJ3Jo" 217 | }, 218 | "outputs": [], 219 | "source": [ 220 | "new_word = Word('cat')" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": { 226 | "id": "gZiwYBoDKFZs" 227 | }, 228 | "source": [ 229 | "Now we have created a new object based on our blueprint. We can access the instance attributes by using `object.attribute`." 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": null, 235 | "metadata": { 236 | "colab": { 237 | "base_uri": "https://localhost:8080/" 238 | }, 239 | "id": "gOsx-klVJ-Aw", 240 | "outputId": "81a5556b-9626-4cc4-c6ba-909b88491ad4" 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "('cat', 3)" 247 | ] 248 | }, 249 | "execution_count": 8, 250 | "metadata": { 251 | "tags": [] 252 | }, 253 | "output_type": "execute_result" 254 | } 255 | ], 256 | "source": [ 257 | "new_word.word, new_word.length" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "metadata": { 263 | "id": "qwtBzBdJKYMH" 264 | }, 265 | "source": [ 266 | "Of course, we now also use the methods of the object by calling `object.method()`." 
267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "colab": { 274 | "base_uri": "https://localhost:8080/" 275 | }, 276 | "id": "015fB5fhKbXC", 277 | "outputId": "9744fd51-1bb7-4117-ac5d-29292c5c4a7f" 278 | }, 279 | "outputs": [ 280 | { 281 | "data": { 282 | "application/vnd.google.colaboratory.intrinsic+json": { 283 | "type": "string" 284 | }, 285 | "text/plain": [ 286 | "'tac'" 287 | ] 288 | }, 289 | "execution_count": 9, 290 | "metadata": { 291 | "tags": [] 292 | }, 293 | "output_type": "execute_result" 294 | } 295 | ], 296 | "source": [ 297 | "new_word.reverse()\n", 298 | "new_word.word" 299 | ] 300 | }, 301 | { 302 | "cell_type": "markdown", 303 | "metadata": { 304 | "id": "q-8lFCNwiEtp" 305 | }, 306 | "source": [ 307 | "### List Comprehensions" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": null, 313 | "metadata": { 314 | "colab": { 315 | "base_uri": "https://localhost:8080/" 316 | }, 317 | "id": "9G7Z1yWA5tql", 318 | "outputId": "14078e09-4582-4c74-879c-6696bbfa05ae" 319 | }, 320 | "outputs": [ 321 | { 322 | "data": { 323 | "text/plain": [ 324 | "[100, 200, 300]" 325 | ] 326 | }, 327 | "execution_count": 10, 328 | "metadata": { 329 | "tags": [] 330 | }, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "numbers = [10, 20, 30]\n", 336 | "times_ten = [n * 10 for n in numbers]\n", 337 | "\n", 338 | "times_ten" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": { 345 | "colab": { 346 | "base_uri": "https://localhost:8080/" 347 | }, 348 | "id": "ORDQhxIF53nm", 349 | "outputId": "1eff98d3-d0dd-4599-9c94-8e956cd5258c" 350 | }, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "[1, 2, 3]" 356 | ] 357 | }, 358 | "execution_count": 11, 359 | "metadata": { 360 | "tags": [] 361 | }, 362 | "output_type": "execute_result" 363 | } 364 | ], 365 | "source": [ 366 | "list_of_lists = [['A', 
1], ['B', 2], ['C', 3]]\n", 367 | "only_first_element = [n[1] for n in list_of_lists]\n", 368 | "\n", 369 | "only_first_element" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "id": "vmGV6kC8ebGF" 376 | }, 377 | "source": [ 378 | "### Enumerate" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "metadata": { 385 | "colab": { 386 | "base_uri": "https://localhost:8080/" 387 | }, 388 | "id": "_zJcldtpedGu", 389 | "outputId": "808114c7-ce2b-4581-915d-d253670c3367" 390 | }, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "0 A\n", 397 | "1 B\n", 398 | "2 C\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "l = ['A', 'B', 'C']\n", 404 | "\n", 405 | "for index, value in enumerate(l):\n", 406 | " print(index, value)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "metadata": { 412 | "id": "fGM5yrfSYN0E" 413 | }, 414 | "source": [ 415 | "## 3. Exercises (8 to 17)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "id": "n_BfVcYGKLRw" 422 | }, 423 | "source": [ 424 | "### Exercise 8 – Concordancer" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": { 431 | "id": "7K10f5ATI0qa" 432 | }, 433 | "outputs": [], 434 | "source": [ 435 | "# YOUR CODE GOES HERE" 436 | ] 437 | }, 438 | { 439 | "attachments": {}, 440 | "cell_type": "markdown", 441 | "metadata": { 442 | "id": "ifm5BisVseyt" 443 | }, 444 | "source": [ 445 | "### Exercise 9 – N-Grams\n", 446 | "Note: Number of N-Grams = Tokens + 1 - N" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": { 453 | "id": "r7JBw7IqnF2Z" 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "# YOUR CODE GOES HERE" 458 | ] 459 | }, 460 | { 461 | "attachments": {}, 462 | "cell_type": "markdown", 463 | "metadata": { 464 | "id": "DbitsVl5NDQF" 465 | }, 466 | "source": [ 467 | 
"### Exercise 10 – Frequency Analysis" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "id": "P95yKU-DnME0" 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "# YOUR CODE GOES HERE" 479 | ] 480 | }, 481 | { 482 | "attachments": {}, 483 | "cell_type": "markdown", 484 | "metadata": { 485 | "id": "pF9NL_sMl4jQ" 486 | }, 487 | "source": [ 488 | "### Exercise 11 – Computing Basic Statistics" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "id": "0IW9jFTwnRkQ" 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "# YOUR CODE GOES HERE" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "id": "qYG5cZGAj8Yk" 506 | }, 507 | "source": [ 508 | "### Exercise 12 – Basic Collocation Analysis" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": { 515 | "id": "B_FmWJinkBSa" 516 | }, 517 | "outputs": [], 518 | "source": [ 519 | "# YOUR CODE GOES HERE" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "id": "YDFk4KYuO5NP" 526 | }, 527 | "source": [ 528 | "### Exercise 13 – NLTK Stemming, Lemmatization, and WordNet" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": { 535 | "id": "rIc1pHatnXlj" 536 | }, 537 | "outputs": [], 538 | "source": [ 539 | "# YOUR CODE GOES HERE" 540 | ] 541 | }, 542 | { 543 | "cell_type": "markdown", 544 | "metadata": { 545 | "id": "l6a71ydw2Dea" 546 | }, 547 | "source": [ 548 | "### Exercise 14 – spaCy Tagging" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "id": "-sU3NCYSncBs" 556 | }, 557 | "outputs": [], 558 | "source": [ 559 | "# YOUR CODE GOES HERE" 560 | ] 561 | }, 562 | { 563 | "attachments": {}, 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "id": "ovBYK7RO2GU2" 567 | }, 568 | "source": [ 569 | 
"### Exercise 15 – Parsing XML" 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": null, 575 | "metadata": { 576 | "id": "z3se7AiWnfvN" 577 | }, 578 | "outputs": [], 579 | "source": [ 580 | "# YOUR CODE GOES HERE" 581 | ] 582 | }, 583 | { 584 | "attachments": {}, 585 | "cell_type": "markdown", 586 | "metadata": { 587 | "id": "Id18_qZ52OyO" 588 | }, 589 | "source": [ 590 | "### Exercise 16 – Web Scraping" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": { 597 | "id": "NmzLJJWunjBe" 598 | }, 599 | "outputs": [], 600 | "source": [ 601 | "# YOUR CODE GOES HERE" 602 | ] 603 | }, 604 | { 605 | "attachments": {}, 606 | "cell_type": "markdown", 607 | "metadata": { 608 | "id": "zmr79eLxY7-Y" 609 | }, 610 | "source": [ 611 | "### Exercise 17 – Putting Everything Together (Keyword Analysis)" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": null, 617 | "metadata": { 618 | "id": "-ffpnkkNnobd" 619 | }, 620 | "outputs": [], 621 | "source": [ 622 | "# YOUR CODE GOES HERE" 623 | ] 624 | } 625 | ], 626 | "metadata": { 627 | "colab": { 628 | "collapsed_sections": [ 629 | "iJwE7TYEo1_7" 630 | ], 631 | "name": "Exercises_8_to_17.ipynb", 632 | "provenance": [], 633 | "toc_visible": true 634 | }, 635 | "kernelspec": { 636 | "display_name": "base", 637 | "language": "python", 638 | "name": "python3" 639 | }, 640 | "language_info": { 641 | "name": "python", 642 | "version": "3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]" 643 | }, 644 | "vscode": { 645 | "interpreter": { 646 | "hash": "3a07b995cb36b494b18415e0955c16896c9f4d40d49bdfea70e60aad8810a43a" 647 | } 648 | } 649 | }, 650 | "nbformat": 4, 651 | "nbformat_minor": 0 652 | } 653 | -------------------------------------------------------------------------------- /2021/learning-path-beginner-long.md: -------------------------------------------------------------------------------- 1 | # Learning Path - Absolute 
Beginner (Long) 2 | 3 | Welcome to the *Absolute Beginner (Long)* learning path. This is also the *default path* for this workshop. 4 | 5 | Before you proceed, please make sure that you have read the introductory [README.md](https://github.com/IngoKl/python-programming-for-linguists/blob/main/README.md) file outlining this course. If you are eager, this would also be a great opportunity to have a look at the [*Markdown Primer*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Markdown_Primer.md) to learn something about Markdown. 6 | 7 | ## First Steps 8 | 9 | Before starting the actual workshop, you will have to make a decision. You can either use *Google Colab*, a browser-based development environment, or your own local environment (on your computer) for coding along. While the first option is a lot easier, more comfortable, and recommended, you will be required to have or create a Google account. 10 | 11 | If you are willing to use or create a Google account, proceed as follows: 12 | 13 | 1. Watch the video ["Python Programming for Absolute Beginners"](https://www.youtube.com/watch?v=4UnF45lniyY). 14 | 2. Watch the video ["Getting Started with Google Colab"](https://www.youtube.com/watch?v=JxjUEvQSFkU). 15 | 16 | You can also flip these two videos around if you are eager to start coding as fast as possible! 17 | 18 | If you don't want to rely on Google, you will have to set up your own development environment locally. This is a little bit tricky, but you'll be fine! In this case, proceed as follows: 19 | 20 | 1. Watch the video ["Python Programming for Absolute Beginners"](https://www.youtube.com/watch?v=4UnF45lniyY). 21 | 2. Watch the video ["Setting Up Your Development Environment (Windows)"](https://www.youtube.com/watch?v=xrXEouns3fg). 22 | 23 | ## Let's Get Coding 24 | 25 | You should now have a first understanding of the Python programming language as well as an environment in which you can code. 
*Awesome!* 🎉 26 | If you have the time, have a brief look at the [*Commenting in Python*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Commenting_in_Python.md) guide in order to learn how to document your code a little bit better. 27 | 28 | Now you will go through a series of videos and exercises. These will teach you Python and also prepare you for the live session (and/or the recording of it) as well as the more challenging exercises down the line. 29 | 30 | 3. Familiarize yourself with Python notebooks and try to solve [exercises 1 to 3](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%201-3.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_1_3.ipynb)). 31 | 4. Watch the video ["The Pizza Problem"](https://www.youtube.com/watch?v=g9tOyVI5B3E). 32 | 5. Try to solve [exercises 4 and 5](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%204-5.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_4_5.ipynb)). 33 | 34 | If you are unfamiliar with *Regular Expressions*, now would be a great time to have a look at the [*RegEx Primer for Linguistics*](https://www.youtube.com/watch?v=p7-QkwOU9RY). After watching the video, you should be prepared to take a shot at the [Additional Exercises on Regular Expressions](https://www.youtube.com/watch?v=GGEveroG3Fgs). 35 | 36 | Let's get back to the regular program: 37 | 38 | 6. Watch the video ["Working with Files, Texts, and Regular Expressions"](https://www.youtube.com/watch?v=y37_JvSY-GM). 39 | 7. Try to solve [exercises 6 and 7](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%206-7.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_6_7.ipynb)).
40 | 41 | Now we are approaching the biggest set of exercises in this workshop. These exercises (Exercises 8 to 17) are very challenging and will be solved during the live session. Of course, you can also go through them by watching the recording. In these exercises, we will recreate some common tools used in corpus linguistics as well as work with some real-life data. 42 | 43 | Either way, you should prepare a little before attempting them: 44 | 45 | 8. Have a look at the [*(Linux) Command Line Primer*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Command_Line_Primer.md). 46 | 9. Have a look at the [*New Tools and Syntax*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/notebooks/03_New_Tools_and_Syntax.ipynb) notebook and play around with these new tools in your toolbox. 47 | 48 | 10. Now you should be ready for [*Exercises 8 to 17*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Exercises%208-17.pdf). Have a look at them, but don't get scared! 😱 These are rather challenging, and you are not supposed to solve them on your own just now! That said, especially if you are planning to attend the live session, having a look at them beforehand will help during the session. If you are working through the workshop at your own pace, have a look at the [recording](https://youtu.be/DLyVL0mEISU) and work through the exercises. There are [solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2022/exercises/Solutions_Exercises_8_17.ipynb) available for you to consult and experiment with. 49 | 50 | *Congratulations, you are almost finished!* 😃 51 | 52 | 11. Finally, watch the video ["Summary and Resources"](https://www.youtube.com/watch?v=ajKqESDmrKc). 
53 | 54 | Now that you have worked through most of the materials, feel free to have a look at the [Bonus Notebooks](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Bonus%20Notebooks) and the [List of Additional Resources](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Links_to_Resources.md). 55 | -------------------------------------------------------------------------------- /2021/learning-path-beginner-short.md: -------------------------------------------------------------------------------- 1 | # Learning Path - Absolute Beginner (Short) 2 | 3 | Welcome to the *Absolute Beginner (Short)* learning path. This path will skip some less essential steps and materials. 4 | 5 | Before you proceed, please make sure that you have read the introductory [README.md](https://github.com/IngoKl/python-programming-for-linguists/blob/main/README.md) file outlining this course. 6 | 7 | ## First Steps 8 | 9 | Before starting the actual workshop, you will have to make a decision. You can either use *Google Colab*, a browser-based development environment, or your own local environment (on your computer) for coding along. While the first option is a lot easier, more comfortable, and recommended, you will be required to have or create a Google account. 10 | 11 | If you are willing to use or create a Google account, proceed as follows: 12 | 13 | 1. Watch the video ["Python Programming for Absolute Beginners"](https://www.youtube.com/watch?v=4UnF45lniyY). 14 | 2. Watch the video ["Getting Started with Google Colab"](https://www.youtube.com/watch?v=JxjUEvQSFkU). 15 | 16 | You can also flip these two videos around if you are eager to start coding as fast as possible! 17 | 18 | If you don't want to rely on Google, you will have to set up your own development environment locally. This is a little bit tricky, but you'll be fine! In this case, proceed as follows: 19 | 20 | 1.
Watch the video ["Python Programming for Absolute Beginners"](https://www.youtube.com/watch?v=4UnF45lniyY). 21 | 2. Watch the video ["Setting Up Your Development Environment (Windows)"](https://www.youtube.com/watch?v=xrXEouns3fg). 22 | 23 | ## Let's Get Coding 24 | 25 | You should now have a first understanding of the Python programming language as well as an environment in which you can code. *Awesome!* 🎉 26 | 27 | Now you will go through a series of videos and exercises. These will teach you Python and also prepare you for the live session (and/or the recording of it) as well as the more challenging exercises down the line. 28 | 29 | 3. Familiarize yourself with Python notebooks and try to solve [exercises 1 to 3](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%201-3.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_1_3.ipynb)). 30 | 4. Watch the video ["The Pizza Problem"](https://www.youtube.com/watch?v=g9tOyVI5B3E). 31 | 5. Try to solve [exercises 4 and 5](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%204-5.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_4_5.ipynb)). 32 | 6. Watch the video ["Working with Files, Texts, and Regular Expressions"](https://www.youtube.com/watch?v=y37_JvSY-GM). 33 | 7. Try to solve [exercises 6 and 7](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%206-7.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_6_7.ipynb)). 34 | 35 | Now we are approaching the biggest set of exercises in this workshop. These exercises (Exercises 8 to 17) are very challenging and will be solved during the live session. Of course, you can also go through them by watching the recording.
In these exercises, we will recreate some common tools used in corpus linguistics as well as work with some real-life data. 36 | 37 | Either way, you should prepare a little before attempting them: 38 | 39 | 8. Have a look at the [*New Tools and Syntax*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/notebooks/03_New_Tools_and_Syntax.ipynb) notebook and play around with these new tools in your toolbox. 40 | 41 | 9. Now you should be ready for [*Exercises 8 to 17*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Exercises%208-17.pdf). Have a look at them, but don't get scared! 😱 These are rather challenging, and you are not supposed to solve them on your own just now! That said, especially if you are planning to attend the live session, having a look at them beforehand will help during the session. If you are working through the workshop at your own pace, have a look at the [recording](https://youtu.be/DLyVL0mEISU) and work through the exercises. There are [solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2022/exercises/Solutions_Exercises_8_17.ipynb) available for you to consult and experiment with. 42 | 43 | *Congratulations, you are almost finished!* 😃 44 | 45 | 10. Finally, watch the video ["Summary and Resources"](https://www.youtube.com/watch?v=ajKqESDmrKc). 46 | 47 | Now that you have worked through most of the materials feel free to have a look at the [Bonus Notebooks](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Bonus%20Notebooks) and the [List of Additional Resources](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Links_to_Resources.md). 
48 | -------------------------------------------------------------------------------- /2021/learning-path-experienced.md: -------------------------------------------------------------------------------- 1 | # Learning Path - Experienced in Python 2 | 3 | Welcome to the *Experienced in Python* learning path. This is the shortest learning path and will skip over most introductory material. This document, compared to the beginner's version, is also very matter-of-fact. If you are looking for a little bit more handholding, have a look at the other learning paths. 😊 4 | 5 | Before you proceed, please make sure that you have read the introductory [README.md](https://github.com/IngoKl/python-programming-for-linguists/blob/main/README.md) file outlining this course. 6 | 7 | Since you already have some experience with Python, I assume that you are able to run *Jupyter Notebooks* either locally or in the cloud (e.g., *Google Colab*). 8 | 9 | ## Let's Get Coding 10 | 11 | Having previous Python knowledge will allow you to skip the first few videos and exercises. 12 | 13 | If you are unfamiliar with *Regular Expressions*, now would be a great time to have a look at the [*RegEx Primer for Linguistics*](https://www.youtube.com/watch?v=p7-QkwOU9RY). After watching the video, you should be prepared to take a shot at the [Additional Exercises on Regular Expressions](https://www.youtube.com/watch?v=GGEveroG3Fgs). 14 | 15 | 1. Watch the video ["Working with Files, Texts, and Regular Expressions"](https://www.youtube.com/watch?v=y37_JvSY-GM). 16 | 2. Try to solve [exercises 6 and 7](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%206-7.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_6_7.ipynb)). 17 | 18 | Now we are approaching the biggest set of exercises in this workshop. 
These exercises (Exercises 8 to 17) are very challenging and will be solved during the live session. Of course, you can also go through them by watching the recording. In these exercises, we will recreate some common tools used in corpus linguistics as well as work with some real-life data. 19 | 20 | If you have knowledge about intermediate Python concepts (e.g., List Comprehensions and Pandas `DataFrames`) you can safely skip to the Exercises. If not, have a look at the [*New Tools and Syntax*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/notebooks/03_New_Tools_and_Syntax.ipynb) notebook and play around with these new tools in your toolbox. 21 | 22 | 3. Now you should be ready for [*Exercises 8 to 17*](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Exercises%208-17.pdf). Have a look at them, but don't get scared! 😱 These are rather challenging, and you are not supposed to solve them on your own just now! That said, especially if you are planning to attend the live session, having a look at them beforehand will help during the session. If you are working through the workshop at your own pace, have a look at the [recording](https://youtu.be/DLyVL0mEISU) and work through the exercises. 23 | 24 | *Congratulations, you are almost finished!* 😃 25 | 26 | 4. Finally, watch the video ["Summary and Resources"](https://www.youtube.com/watch?v=ajKqESDmrKc). 27 | 28 | Now that you have worked through most of the materials feel free to have a look at the [Bonus Notebooks](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Bonus%20Notebooks) and the [List of Additional Resources](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Links_to_Resources.md). 
29 | -------------------------------------------------------------------------------- /2021/notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/notebooks/.gitkeep -------------------------------------------------------------------------------- /2021/scripts/my_functions.R: -------------------------------------------------------------------------------- 1 | to_lowercase <- function(s) { 2 | lower <- tolower(s) 3 | print(lower) 4 | } 5 | 6 | add_tree <- function(a, b, c) { 7 | return(a + b + c) 8 | } -------------------------------------------------------------------------------- /2021/slides/03 - Python for (Corpus) Linguists.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/slides/03 - Python for (Corpus) Linguists.pdf -------------------------------------------------------------------------------- /2021/slides/03 - Python for (Corpus) Linguists.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/slides/03 - Python for (Corpus) Linguists.pptx -------------------------------------------------------------------------------- /2021/slides/06 - Google Colab.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/slides/06 - Google Colab.pdf -------------------------------------------------------------------------------- /2021/slides/06 - Google Colab.pptx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/slides/06 - Google Colab.pptx -------------------------------------------------------------------------------- /2021/slides/Additional Exercises.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2021/slides/Additional Exercises.pptx -------------------------------------------------------------------------------- /2021/youtube-video-descriptions.md: -------------------------------------------------------------------------------- 1 | # YouTube Video Descriptions (2021) 2 | 3 | ## 06 - Getting Started with Google Colab 4 | 5 | This video is part of my 2021 "Python Programming for Linguists" workshop. 6 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 7 | 8 | Content 9 | 00:00 Welcome 10 | 00:26 Introduction to Google Colab 11 | 01:52 Google Colab 12 | 02:35 Creating New Notebooks 13 | 04:15 Running System/Shell Commands 14 | 05:10 GitHub Repositories 15 | 05:55 Uploading Notebooks 16 | 17 | ## Additional Exercises - Regular Expressions 18 | 19 | This video is part of my 2021 "Python Programming for Linguists" workshop. 
20 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 21 | 22 | Content 23 | 00:00 Welcome 24 | 00:16 RegEx Refresher 25 | 02:04 Intro to Exercises 26 | 05:05 Exercise 1 27 | 07:40 Exercise 2 28 | 08:28 Exercise 3 29 | 11:15 Exercise 4 30 | 14:23 Exercise 5 -------------------------------------------------------------------------------- /2022/slides/03 - Python for (Corpus) Linguists.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2022/slides/03 - Python for (Corpus) Linguists.pdf -------------------------------------------------------------------------------- /2022/slides/03 - Python for (Corpus) Linguists.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/2022/slides/03 - Python for (Corpus) Linguists.pptx -------------------------------------------------------------------------------- /2022/youtube-video-descriptions.md: -------------------------------------------------------------------------------- 1 | # YouTube Video Descriptions (2022) 2 | 3 | ## Playlist (Workshop: Python Programming for Linguists) 4 | 5 | In this workshop, consisting of several videos and exercises, you will be introduced to Python and its application within (corpus) linguistics. After a short general introduction to programming as well as Python, we will utilize Python to solve several (corpus) linguistic exercises. 6 | 7 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 8 | 9 | ## 03 - Python for (Corpus) Linguists (2022) 10 | 11 | This video is part of my "Python Programming for Linguists" workshop. 
12 | Workshop: https://github.com/IngoKl/python-programming-for-linguists 13 | 14 | First, this video introduces some new Python/programming concepts and essential tools. Then, we are going to work through exercises 8 to 17. 15 | 16 | The finished and somewhat polished Python notebook (Google Colab), as well as the slides, are available in the repository. 17 | 18 | Content 19 | 00:00 Introduction 20 | 01:04 Overview 21 | 02:36 Note on ChatGPT and LLMs 22 | 05:32 Concept: Importing + Environment 23 | 10:19 Concept: Classes and Methods 24 | 14:57 Concept: List Comprehensions 25 | 18:57 Concept: Enumerate 26 | 20:54 Tool/Library: ftfy (Fixing Unicode) 27 | 22:27 Concept: Pandas and DataFrames 28 | 24:30 Tool/Library: TextDirectory 29 | 25:53 Ex. 8 – Concordancer 30 | 33:23 Concept: join 31 | 34:18 Concept: Slicing Strings 32 | 43:35 Ex. 9 – N-Grams 33 | 48:39 ChatGPT Solution 34 | 51:32 Ex. 10 – Frequency Analysis 35 | 52:18 Concept: Counter 36 | 01:04:34 Tool/Library: spaCy 37 | 01:09:42 Ex. 11 – Computing Basic Statistics 38 | 01:11:08 Concept: Lists and Sets 39 | 01:12:06 NLP Concept: Vocabulary 40 | 01:31:45 Ex. 12 – Basic Collocation Analysis 41 | 01:35:14 CL Concept: Collocation and MI-score 42 | 01:41:55 Ex. 13 – NLTK Stemming, Lemmatization, and WordNet 43 | 01:42:52 Database: WordNet + Synsets 44 | 01:48:58 Ex. 14 – spaCy Tagging 45 | 01:55:58 Ex. 15 – Parsing XML 46 | 01:57:10 Concept: XML and XPath 47 | 02:00:55 Tool/Library: LXML 48 | 02:02:50 Ex. 16 – Web Scraping 49 | 02:04:00 Tool/Library: requests + HTTP GET 50 | 02:09:55 Tool/Library: jusText 51 | 02:11:25 Ex. 
17 – Putting Everything Together (Keyword Analysis) 52 | 02:19:25 Concept: Lambda Functions 53 | 02:21:10 CL Concept: Simple Maths Parameter -------------------------------------------------------------------------------- /Bonus Notebooks/Pizza_Problem_Dataclass_Solution.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Pizza_Problem_Dataclass_Solution.ipynb", 7 | "provenance": [] 8 | }, 9 | "kernelspec": { 10 | "name": "python3", 11 | "display_name": "Python 3" 12 | }, 13 | "language_info": { 14 | "name": "python" 15 | } 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "pa2QujQSwJC3" 22 | }, 23 | "source": [ 24 | "This optimized solution to the Pizza Problem makes use of Python's advanced `dataclass` feature. In order to understand what's going on, you need a solid understanding of classes. Also, have a look at this [*Real Python* article](https://realpython.com/python-data-classes/) for an introduction to dataclasses." 
25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "metadata": { 30 | "id": "lSLTbw6g0z2E" 31 | }, 32 | "source": [ 33 | "import math\n", 34 | "from dataclasses import dataclass, field" 35 | ], 36 | "execution_count": 1, 37 | "outputs": [] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "metadata": { 42 | "id": "Dwnjkvc6lqr2" 43 | }, 44 | "source": [ 45 | "@dataclass\n", 46 | "class Pizza:\n", 47 | " name: str = 'pizza'\n", 48 | " size: list = field(default_factory=lambda: [26, 0])\n", 49 | " price: float = 5.0\n", 50 | " shape: str = 'circle'\n", 51 | " \n", 52 | " pter: float\n", 53 | " _pter: float = field(init=False, repr=False)\n", 54 | "\n", 55 | " @property\n", 56 | " def area(self) -> float:\n", 57 | " if self.shape == 'circle':\n", 58 | " return round(math.pi * (self.size[0] / 2) ** 2)\n", 59 | " if self.shape == 'rectangle':\n", 60 | " return round(self.size[0] * self.size[1])\n", 61 | "\n", 62 | " @property\n", 63 | " def pter(self) -> float:\n", 64 | " return round(self._pter)\n", 65 | "\n", 66 | " @pter.setter\n", 67 | " def pter(self, pter: float) -> float:\n", 68 | " self._pter = self.area / self.price" 69 | ], 70 | "execution_count": 1, 71 | "outputs": [] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "metadata": { 76 | "id": "VK56kQmSmHgz" 77 | }, 78 | "source": [ 79 | "pizzas = [\n", 80 | " Pizza('small', [26, 0], 4.80, 'circle'),\n", 81 | " Pizza('large', [30, 0], 5.50, 'circle'),\n", 82 | " Pizza('party', [46, 33], 13.00, 'rectangle'),\n", 83 | "]" 84 | ], 85 | "execution_count": 1, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "metadata": { 91 | "colab": { 92 | "base_uri": "https://localhost:8080/" 93 | }, 94 | "id": "t-G1GEx-rMJg", 95 | "outputId": "c736743a-798b-4c08-fa55-7e2826b876a1" 96 | }, 97 | "source": [ 98 | "max(pizzas, key=lambda pizza: pizza.pter)" 99 | ], 100 | "execution_count": 1, 101 | "outputs": [ 102 | { 103 | "output_type": "execute_result", 104 | "data": { 105 | "text/plain": [ 106 | "Pizza(name='large', 
size=[30, 0], price=5.5, shape='circle', pter=129)" 107 | ] 108 | }, 109 | "metadata": { 110 | "tags": [] 111 | }, 112 | "execution_count": 1 113 | } 114 | ] 115 | } 116 | ] 117 | } -------------------------------------------------------------------------------- /Bonus Notebooks/Understanding_Classes_and_Objects.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Bonus - Understanding Classes and Objects.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | } 14 | }, 15 | "cells": [ 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "nx5gA92Y5Yyv" 20 | }, 21 | "source": [ 22 | "# Bonus - Understanding Classes and Objects\r\n", 23 | "\r\n", 24 | "Over the course of the workshop, we encountered both **objects** and **methods** from time to time. However, to not make things even more complicated, I did not really discuss what these are.\r\n", 25 | "\r\n", 26 | "Since you have clicked on this notebook, you will now receive a **very** brief introduction to classes, objects, and methods. If you want to dive deeper, have a look at [Object-Oriented Programming (OOP)](https://en.wikipedia.org/wiki/Object-oriented_programming#:~:text=Object%2Doriented%20programming%20(OOP),(often%20known%20as%20methods).)." 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "nFnDm4RP6alo" 33 | }, 34 | "source": [ 35 | "## Classes\r\n", 36 | "\r\n", 37 | "Below you can see an example of a simple `class`. You can think of classes as blueprints for objects that can have attributes and methods." 
38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "metadata": { 43 | "id": "QvT7Dohs5TF-" 44 | }, 45 | "source": [ 46 | "class Cat:\r\n", 47 | " def __init__(self, name, age, color):\r\n", 48 | " self.name = name\r\n", 49 | " self.age = age\r\n", 50 | " self.color = color\r\n", 51 | "\r\n", 52 | " self.hunger = 100\r\n", 53 | " \r\n", 54 | " def __str__(self):\r\n", 55 | " if self.hunger > 50:\r\n", 56 | " return f'My name is {self.name}. I am a {self.age} years old {self.color} cat. I am also quite hungry.'\r\n", 57 | " else:\r\n", 58 | " return f'My name is {self.name}. I am a {self.age} years old {self.color} cat. I am not hungry!.'\r\n", 59 | "\r\n", 60 | " def eat_mouse(self):\r\n", 61 | " print('I just ate a mouse!')\r\n", 62 | " self.hunger -= 10\r\n" 63 | ], 64 | "execution_count": null, 65 | "outputs": [] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "id": "YX708Wqu88L3" 71 | }, 72 | "source": [ 73 | "We now have created a blueprint for a 'cat'. The resulting cat will have four attributes (`name`, `age`, `color`, and `hunger`) and two methods (`__str__` and `eat_mouse`). You can think of methods as functions that are tied to an object.\r\n", 74 | "\r\n", 75 | "The `__str__` method is a bit special. It will be automatically called if we try to `print` an instance of this class (an object). But we're getting ahead of ourselves..." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "id": "NX7xZyQnAs6U" 82 | }, 83 | "source": [ 84 | "## Objects" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "metadata": { 90 | "id": "yrny-rQc9vk5" 91 | }, 92 | "source": [ 93 | "cleo = Cat('Cleo', 2, 'black')" 94 | ], 95 | "execution_count": null, 96 | "outputs": [] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "id": "ItR3FKfW9_c_" 102 | }, 103 | "source": [ 104 | "We have just created an object (a *cat*) based on our blueprint. `cleo` is now an object based on the class (blueprint). 
Let's try to `print cleo`." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "metadata": { 110 | "colab": { 111 | "base_uri": "https://localhost:8080/" 112 | }, 113 | "id": "DzH6pPoh-OJy", 114 | "outputId": "9ff0e8e0-60aa-430c-8b95-b7e498d8a0ad" 115 | }, 116 | "source": [ 117 | "print(cleo)" 118 | ], 119 | "execution_count": null, 120 | "outputs": [ 121 | { 122 | "output_type": "stream", 123 | "text": [ 124 | "My name is Cleo. I am a 2 years old black cat. I am also quite hungry.\n" 125 | ], 126 | "name": "stdout" 127 | } 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "id": "q0tkqJ2h-0qn" 134 | }, 135 | "source": [ 136 | "As you can see, once we tried to `print` the object, the `__str__` method got called. We can now also access and change Cleo's attributes." 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "metadata": { 142 | "colab": { 143 | "base_uri": "https://localhost:8080/" 144 | }, 145 | "id": "gNkwLHv8--tS", 146 | "outputId": "fbaa033c-91c8-4705-dcfc-ad9b22a3796d" 147 | }, 148 | "source": [ 149 | "cleo.age = 3\r\n", 150 | "print(cleo)" 151 | ], 152 | "execution_count": null, 153 | "outputs": [ 154 | { 155 | "output_type": "stream", 156 | "text": [ 157 | "My name is Cleo. I am a 3 years old black cat. I am also quite hungry.\n" 158 | ], 159 | "name": "stdout" 160 | } 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "id": "zbKqzvF7_JMH" 167 | }, 168 | "source": [ 169 | "Changing attributes is great! We can, of course, also call methods. For example, our `eat_mouse` method." 
170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "metadata": { 175 | "colab": { 176 | "base_uri": "https://localhost:8080/" 177 | }, 178 | "id": "cn1O_S5V_QCz", 179 | "outputId": "03a09718-3a75-4deb-cbd2-5448cfbe32f8" 180 | }, 181 | "source": [ 182 | "cleo.eat_mouse()" 183 | ], 184 | "execution_count": null, 185 | "outputs": [ 186 | { 187 | "output_type": "stream", 188 | "text": [ 189 | "I just ate a mouse!\n" 190 | ], 191 | "name": "stdout" 192 | } 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": { 198 | "id": "N6OkLVHb_euS" 199 | }, 200 | "source": [ 201 | "Now that Cleo has eaten a mouse, she should be less hungry as eating a mouse reduces `hunger` by 10." 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "metadata": { 207 | "colab": { 208 | "base_uri": "https://localhost:8080/" 209 | }, 210 | "id": "VoxObORD_jcD", 211 | "outputId": "0a0e64c3-da6c-40ba-d55a-616b5d29d050" 212 | }, 213 | "source": [ 214 | "cleo.hunger" 215 | ], 216 | "execution_count": null, 217 | "outputs": [ 218 | { 219 | "output_type": "execute_result", 220 | "data": { 221 | "text/plain": [ 222 | "90" 223 | ] 224 | }, 225 | "metadata": { 226 | "tags": [] 227 | }, 228 | "execution_count": 34 229 | } 230 | ] 231 | }, 232 | { 233 | "cell_type": "markdown", 234 | "metadata": { 235 | "id": "nTWshN0d_pnA" 236 | }, 237 | "source": [ 238 | "Let's feed her a couple more mice ..." 
239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "metadata": { 244 | "colab": { 245 | "base_uri": "https://localhost:8080/" 246 | }, 247 | "id": "XBNboqlI_u7K", 248 | "outputId": "0ce3e443-a53f-43a8-b7bc-9d332326bb8b" 249 | }, 250 | "source": [ 251 | "cleo.eat_mouse()\r\n", 252 | "cleo.eat_mouse()\r\n", 253 | "cleo.eat_mouse()\r\n", 254 | "cleo.eat_mouse()\r\n", 255 | "cleo.eat_mouse()" 256 | ], 257 | "execution_count": null, 258 | "outputs": [ 259 | { 260 | "output_type": "stream", 261 | "text": [ 262 | "I just ate a mouse!\n", 263 | "I just ate a mouse!\n", 264 | "I just ate a mouse!\n", 265 | "I just ate a mouse!\n", 266 | "I just ate a mouse!\n" 267 | ], 268 | "name": "stdout" 269 | } 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "metadata": { 275 | "colab": { 276 | "base_uri": "https://localhost:8080/" 277 | }, 278 | "id": "ANaPLL-k_z0a", 279 | "outputId": "580474c4-315d-4bb4-dd18-430611bde8c2" 280 | }, 281 | "source": [ 282 | "print(cleo)" 283 | ], 284 | "execution_count": null, 285 | "outputs": [ 286 | { 287 | "output_type": "stream", 288 | "text": [ 289 | "My name is Cleo. I am a 3 years old black cat. I am not hungry!.\n" 290 | ], 291 | "name": "stdout" 292 | } 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "yyHKf41x_07r" 299 | }, 300 | "source": [ 301 | "Finally, Cleo isn't hungry anymore!\r\n", 302 | "\r\n", 303 | "Alright, so we have seen that objects (instances of classes) can have attributes and methods. The beauty of having a blueprint, however, is that we can have an unlimited number of objects created from them." 
304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "metadata": { 309 | "id": "cCErBGxsAMUn" 310 | }, 311 | "source": [ 312 | "ada = Cat('Ada', 4, 'red')" 313 | ], 314 | "execution_count": null, 315 | "outputs": [] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "metadata": { 320 | "colab": { 321 | "base_uri": "https://localhost:8080/" 322 | }, 323 | "id": "zmNlV1QSAXDU", 324 | "outputId": "3ed3fdac-4ead-49aa-b849-388c0763a680" 325 | }, 326 | "source": [ 327 | "print(ada)" 328 | ], 329 | "execution_count": null, 330 | "outputs": [ 331 | { 332 | "output_type": "stream", 333 | "text": [ 334 | "My name is Ada. I am a 4 years old red cat. I am also quite hungry.\n" 335 | ], 336 | "name": "stdout" 337 | } 338 | ] 339 | }, 340 | { 341 | "cell_type": "markdown", 342 | "metadata": { 343 | "id": "odgL1TF0AZ6x" 344 | }, 345 | "source": [ 346 | "As there is nothing special about objects, we can also, for example, put them into a list and loop over them." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "metadata": { 352 | "id": "XXZHPUJHAfzm" 353 | }, 354 | "source": [ 355 | "cats = [cleo, ada]" 356 | ], 357 | "execution_count": null, 358 | "outputs": [] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "metadata": { 363 | "colab": { 364 | "base_uri": "https://localhost:8080/" 365 | }, 366 | "id": "Yq8Fu_K5AimR", 367 | "outputId": "b671f9da-e85c-4bfb-e8a3-c792881a5dbd" 368 | }, 369 | "source": [ 370 | "for cat in cats:\r\n", 371 | " print(cat.name, cat.age, cat.hunger)" 372 | ], 373 | "execution_count": null, 374 | "outputs": [ 375 | { 376 | "output_type": "stream", 377 | "text": [ 378 | "Cleo 3 40\n", 379 | "Ada 4 100\n" 380 | ], 381 | "name": "stdout" 382 | } 383 | ] 384 | }, 385 | { 386 | "cell_type": "markdown", 387 | "metadata": { 388 | "id": "01w8snGZBDMa" 389 | }, 390 | "source": [ 391 | "## A Slightly More Useful Example\r\n", 392 | "\r\n", 393 | "Now that we talked a lot about cats let's try to come up with something more useful. 
Let's say that we want to have a slightly better way of storing and handling documents.\r\n", 394 | "\r\n", 395 | "**Note**: Whenever we create a new object, the `__init__` method is automatically called." 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "metadata": { 401 | "id": "GVdAO-xoCKsE" 402 | }, 403 | "source": [ 404 | "%%capture\r\n", 405 | "!git clone https://github.com/IngoKl/python-programming-for-linguists" 406 | ], 407 | "execution_count": null, 408 | "outputs": [] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "metadata": { 413 | "colab": { 414 | "base_uri": "https://localhost:8080/" 415 | }, 416 | "id": "wLGkU9VPDQzB", 417 | "outputId": "ae9239fe-d3a6-4e1a-8f0e-c0ecffbdb6e8" 418 | }, 419 | "source": [ 420 | "!ls" 421 | ], 422 | "execution_count": null, 423 | "outputs": [ 424 | { 425 | "output_type": "stream", 426 | "text": [ 427 | "python-programming-for-linguists sample_data\n" 428 | ], 429 | "name": "stdout" 430 | } 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "metadata": { 436 | "id": "lhJhAsDPBUpB" 437 | }, 438 | "source": [ 439 | "class Document:\r\n", 440 | " def __init__(self, file):\r\n", 441 | " self.file = file\r\n", 442 | " self.tokens = []\r\n", 443 | " self.token_count = None\r\n", 444 | "\r\n", 445 | " with open(self.file, 'r') as f:\r\n", 446 | " self.text = f.read()\r\n", 447 | "\r\n", 448 | " self.tokenize()\r\n", 449 | "\r\n", 450 | " def __str__(self):\r\n", 451 | " return f'Document created from {self.file} with {self.token_count} tokens.'\r\n", 452 | "\r\n", 453 | " def tokenize(self):\r\n", 454 | " self.tokens = self.text.split()\r\n", 455 | " self.token_count = len(self.tokens)\r\n" 456 | ], 457 | "execution_count": null, 458 | "outputs": [] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "id": "MYgQcN3fEw6f" 464 | }, 465 | "source": [ 466 | "We now have a simple class which, once you create an object, reads a text file and automatically tokenizes it.\r\n", 467 | "\r\n", 468 | 
"To do this, we have a method called `tokenize`, which is called straight from the `__init__` method." 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "metadata": { 474 | "id": "XdXmrCjSCZHM" 475 | }, 476 | "source": [ 477 | "cologne = Document('python-programming-for-linguists/2020/data/wikipedia/cologne.txt')" 478 | ], 479 | "execution_count": null, 480 | "outputs": [] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "metadata": { 485 | "colab": { 486 | "base_uri": "https://localhost:8080/" 487 | }, 488 | "id": "V9xt1sbRErXj", 489 | "outputId": "df028d3e-4224-495e-a333-a820a90e045c" 490 | }, 491 | "source": [ 492 | "print(cologne)" 493 | ], 494 | "execution_count": null, 495 | "outputs": [ 496 | { 497 | "output_type": "stream", 498 | "text": [ 499 | "Document created from python-programming-for-linguists/2020/data/wikipedia/cologne.txt with 490 tokens.\n" 500 | ], 501 | "name": "stdout" 502 | } 503 | ] 504 | } 505 | ] 506 | } -------------------------------------------------------------------------------- /Bonus Notebooks/Working_with_R_in_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Untitled1.ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "tqXFE16ZGVNd" 23 | }, 24 | "source": [ 25 | "# Bonus Notebook: Working with R in Python" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "id": "EooMbfTPGdmV" 32 | }, 33 | "source": [ 34 | "[**R**](https://www.r-project.org), a free software environment for statistical computing and graphing and a programming language, has gained massive traction in (corpus) linguistics over the last few years.\n", 35 | 
"\n", 36 | "In some cases, it might be helpful to have *Python* and R *interact* – for example if only a small part of your project is written in R.\n", 37 | "\n", 38 | "Fortunately, there is [`rpy2`](https://pypi.org/project/rpy2/) which is a sophisticated Python interface to the *R* language.\n", 39 | "\n", 40 | "*Please Note*: This notebook assumes that you have some working knowledge of R." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/" 48 | }, 49 | "id": "vFgu5X7EGOVf", 50 | "outputId": "cf218107-5747-4cd0-e9aa-33fce9c41752" 51 | }, 52 | "source": [ 53 | "import rpy2\n", 54 | "import rpy2.robjects as robjects\n", 55 | "import rpy2.robjects.packages as rpackages\n", 56 | "\n", 57 | "\n", 58 | "print(rpy2.__version__)" 59 | ], 60 | "execution_count": 1, 61 | "outputs": [ 62 | { 63 | "output_type": "stream", 64 | "text": [ 65 | "3.4.3\n" 66 | ], 67 | "name": "stdout" 68 | } 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "id": "aX-ltQlrH73q" 75 | }, 76 | "source": [ 77 | "## Example 1: Accessing the Embedded R\n", 78 | "\n", 79 | "`rpy2`, under the hood, is running an embedded version of *R*. Also, it provides us with an object (`robjects.r`) that we can use to interface with this *R*. In the example below, we are accessing the `pi` symbol from R." 
80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "metadata": { 85 | "colab": { 86 | "base_uri": "https://localhost:8080/", 87 | "height": 57 88 | }, 89 | "id": "-03NKYI3IVaO", 90 | "outputId": "465bf4b9-99ab-4b7d-d3d9-fbea620a3bc5" 91 | }, 92 | "source": [ 93 | "# R equivalent: > pi\n", 94 | "r_pi = robjects.r['pi']\n", 95 | "\n", 96 | "r_pi" 97 | ], 98 | "execution_count": 2, 99 | "outputs": [ 100 | { 101 | "output_type": "execute_result", 102 | "data": { 103 | "text/html": [ 104 | "\n", 105 | " FloatVector with 1 elements.\n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | "
\n", 111 | " 3.141593\n", 112 | "
\n", 117 | " " 118 | ], 119 | "text/plain": [ 120 | " [RTYPES.REALSXP]\n", 121 | "R classes: ('numeric',)\n", 122 | "[3.141593]" 123 | ] 124 | }, 125 | "metadata": { 126 | "tags": [] 127 | }, 128 | "execution_count": 2 129 | } 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "id": "tpdjArxRKFlW" 136 | }, 137 | "source": [ 138 | "In the example above, you should note that we get a `FloatVector` back. If you just need the number, you need to access `r_pi[0]`." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "metadata": { 144 | "colab": { 145 | "base_uri": "https://localhost:8080/" 146 | }, 147 | "id": "0xLxWtJlKOIP", 148 | "outputId": "eaf8b116-70ab-46c7-8c66-63c800eed995" 149 | }, 150 | "source": [ 151 | "r_pi[0]" 152 | ], 153 | "execution_count": 3, 154 | "outputs": [ 155 | { 156 | "output_type": "execute_result", 157 | "data": { 158 | "text/plain": [ 159 | "3.141592653589793" 160 | ] 161 | }, 162 | "metadata": { 163 | "tags": [] 164 | }, 165 | "execution_count": 3 166 | } 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "id": "8s6PLLo4JCMU" 173 | }, 174 | "source": [ 175 | "## Example 2: Writing an R Function\n", 176 | "\n", 177 | "We are also able to write functions in *R* and then run them 'in'/using *Python*." 
178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "metadata": { 183 | "colab": { 184 | "base_uri": "https://localhost:8080/" 185 | }, 186 | "id": "pZri801fJNxp", 187 | "outputId": "9e83fd3f-cfca-46d0-cec7-6bf90ebebb0b" 188 | }, 189 | "source": [ 190 | "# A simple R function that will lowercase and print an input\n", 191 | "robjects.r('''\n", 192 | " to_lowercase <- function(s) {\n", 193 | " lower <- tolower(s)\n", 194 | " print(lower)\n", 195 | " }\n", 196 | "''')" 197 | ], 198 | "execution_count": 4, 199 | "outputs": [ 200 | { 201 | "output_type": "execute_result", 202 | "data": { 203 | "text/plain": [ 204 | " [RTYPES.CLOSXP]\n", 205 | "R classes: ('function',)" 206 | ] 207 | }, 208 | "metadata": { 209 | "tags": [] 210 | }, 211 | "execution_count": 4 212 | } 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "metadata": { 218 | "colab": { 219 | "base_uri": "https://localhost:8080/", 220 | "height": 75 221 | }, 222 | "id": "AHlbhsriJzwo", 223 | "outputId": "a34e6ce8-93cc-48e5-8630-a56c0a8af95e" 224 | }, 225 | "source": [ 226 | "# Retrieve the function\n", 227 | "r_to_lowercase = robjects.r['to_lowercase']\n", 228 | "\n", 229 | "# Run our R function\n", 230 | "r_to_lowercase('This IS a TeSt')" 231 | ], 232 | "execution_count": 5, 233 | "outputs": [ 234 | { 235 | "output_type": "stream", 236 | "text": [ 237 | "[1] \"this is a test\"\n" 238 | ], 239 | "name": "stdout" 240 | }, 241 | { 242 | "output_type": "execute_result", 243 | "data": { 244 | "text/html": [ 245 | "\n", 246 | " StrVector with 1 elements.\n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | "
\n", 252 | " 'this is a test'\n", 253 | "
\n", 258 | " " 259 | ], 260 | "text/plain": [ 261 | " [RTYPES.STRSXP]\n", 262 | "R classes: ('character',)\n", 263 | "['this is a test']" 264 | ] 265 | }, 266 | "metadata": { 267 | "tags": [] 268 | }, 269 | "execution_count": 5 270 | } 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": { 276 | "id": "ZKXfj0iDKi89" 277 | }, 278 | "source": [ 279 | "### Example 3: Working with R Packages\n", 280 | "\n", 281 | "One of the most powerful things about *R* is the rich ecosystem of packages." 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": { 287 | "id": "1p4WdcywLPsK" 288 | }, 289 | "source": [ 290 | "For this example, we will be using [`tau`](https://cran.r-project.org/web/packages/tau/index.html), a text processing utility package. To do so, we first have to install this package into our embedded *R*." 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "metadata": { 296 | "id": "AYQQneAlKu9D" 297 | }, 298 | "source": [ 299 | "utils = rpackages.importr('utils')\n", 300 | "utils.install_packages('tau')" 301 | ], 302 | "execution_count": null, 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "id": "VYK_toDQLgNs" 309 | }, 310 | "source": [ 311 | "Now, we can import `tau` and work with it!" 
312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "metadata": { 317 | "id": "NGZneLPbLLbF" 318 | }, 319 | "source": [ 320 | "tau = rpackages.importr('tau')" 321 | ], 322 | "execution_count": 7, 323 | "outputs": [] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "metadata": { 328 | "colab": { 329 | "base_uri": "https://localhost:8080/" 330 | }, 331 | "id": "8MWMN3soLkzp", 332 | "outputId": "1826cfa5-d7ae-4e28-831e-32c193df5523" 333 | }, 334 | "source": [ 335 | "tokenized = tau.tokenize('This is a test.')\n", 336 | "\n", 337 | "for token in tokenized:\n", 338 | " print(token)" 339 | ], 340 | "execution_count": 8, 341 | "outputs": [ 342 | { 343 | "output_type": "stream", 344 | "text": [ 345 | "This\n", 346 | " \n", 347 | "is\n", 348 | " \n", 349 | "a\n", 350 | " \n", 351 | "test\n", 352 | ".\n" 353 | ], 354 | "name": "stdout" 355 | } 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": { 361 | "id": "pZxDBTE_L9pi" 362 | }, 363 | "source": [ 364 | "# Example 4: Working with .R Files\n", 365 | "\n", 366 | "In many cases, there will be a `.R` file. For example, if a colleague has written an interesting or useful function in *R* which you want to repurpose in your *Python*.\n", 367 | "\n", 368 | "For this example, we will be using the same `to_lowercase` function from before. However, this time the function will reside in an external file called `my_functions.R`. In addition, the same file will also contain a second function called `add_three`." 
369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "metadata": { 374 | "colab": { 375 | "base_uri": "https://localhost:8080/" 376 | }, 377 | "id": "EAHuhCblNuUi", 378 | "outputId": "1d443da7-8b9c-44de-cabd-2380184cd9c6" 379 | }, 380 | "source": [ 381 | "!rm -r python-programming-for-linguists \n", 382 | "!git clone https://github.com/IngoKl/python-programming-for-linguists " 383 | ], 384 | "execution_count": 9, 385 | "outputs": [ 386 | { 387 | "output_type": "stream", 388 | "text": [ 389 | "Cloning into 'python-programming-for-linguists'...\n", 390 | "remote: Enumerating objects: 379, done.\u001b[K\n", 391 | "remote: Counting objects: 100% (379/379), done.\u001b[K\n", 392 | "remote: Compressing objects: 100% (278/278), done.\u001b[K\n", 393 | "remote: Total 379 (delta 211), reused 262 (delta 94), pack-reused 0\u001b[K\n", 394 | "Receiving objects: 100% (379/379), 5.17 MiB | 11.95 MiB/s, done.\n", 395 | "Resolving deltas: 100% (211/211), done.\n" 396 | ], 397 | "name": "stdout" 398 | } 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "metadata": { 404 | "colab": { 405 | "base_uri": "https://localhost:8080/", 406 | "height": 78 407 | }, 408 | "id": "8x_Ni3zRN4i-", 409 | "outputId": "2aa3e5cc-f866-47a5-e241-46b78537f0a5" 410 | }, 411 | "source": [ 412 | "# We can source the script just as we would do in R\n", 413 | "r_source = robjects.r['source']\n", 414 | "r_source('python-programming-for-linguists/2021/scripts/my_functions.R')" 415 | ], 416 | "execution_count": 10, 417 | "outputs": [ 418 | { 419 | "output_type": "execute_result", 420 | "data": { 421 | "text/html": [ 422 | "\n", 423 | " ListVector with 2 elements.\n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 431 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 440 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | "
\n", 429 | " value\n", 430 | " \n", 432 | " [RTYPES.CLOSXP]\n", 433 | "
\n", 438 | " visible\n", 439 | " \n", 441 | " [RTYPES.LGLSXP]\n", 442 | "
\n", 447 | " " 448 | ], 449 | "text/plain": [ 450 | " [RTYPES.VECSXP]\n", 451 | "R classes: ('list',)\n", 452 | "[SexpClosure, BoolSexpVector]\n", 453 | " value: \n", 454 | " [RTYPES.CLOSXP]\n", 455 | " visible: \n", 456 | " [RTYPES.LGLSXP]" 457 | ] 458 | }, 459 | "metadata": { 460 | "tags": [] 461 | }, 462 | "execution_count": 10 463 | } 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "metadata": { 469 | "id": "QqnjwojpOkm7" 470 | }, 471 | "source": [ 472 | "r_add_three = robjects.r['add_tree']\n", 473 | "r_to_lowercase = robjects.r['to_lowercase']" 474 | ], 475 | "execution_count": 11, 476 | "outputs": [] 477 | }, 478 | { 479 | "cell_type": "markdown", 480 | "metadata": { 481 | "id": "BuWXcIKqO1pB" 482 | }, 483 | "source": [ 484 | "We already know how `to_lowercase` works. Let's have a look at the internals of `add_tree` from within *Python*." 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "metadata": { 490 | "colab": { 491 | "base_uri": "https://localhost:8080/" 492 | }, 493 | "id": "yrw55Ul-OuSt", 494 | "outputId": "c95aff34-8553-4433-8915-e65be6b5cd5f" 495 | }, 496 | "source": [ 497 | "# This will show us the R code\n", 498 | "print(r_add_three.r_repr())" 499 | ], 500 | "execution_count": 12, 501 | "outputs": [ 502 | { 503 | "output_type": "stream", 504 | "text": [ 505 | "function (a, b, c) \n", 506 | "{\n", 507 | " return(a + b + c)\n", 508 | "}\n" 509 | ], 510 | "name": "stdout" 511 | } 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "metadata": { 517 | "colab": { 518 | "base_uri": "https://localhost:8080/", 519 | "height": 57 520 | }, 521 | "id": "u1NaS11gPeZV", 522 | "outputId": "056519a7-2e69-44c7-d1aa-5c9089e7abbf" 523 | }, 524 | "source": [ 525 | "r_add_three(1, 2, 3)" 526 | ], 527 | "execution_count": 13, 528 | "outputs": [ 529 | { 530 | "output_type": "execute_result", 531 | "data": { 532 | "text/html": [ 533 | "\n", 534 | " IntVector with 1 elements.\n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 
542 | " \n", 543 | " \n", 544 | " \n", 545 | "
\n", 540 | " 6\n", 541 | "
\n", 546 | " " 547 | ], 548 | "text/plain": [ 549 | " [RTYPES.INTSXP]\n", 550 | "R classes: ('integer',)\n", 551 | "[6]" 552 | ] 553 | }, 554 | "metadata": { 555 | "tags": [] 556 | }, 557 | "execution_count": 13 558 | } 559 | ] 560 | } 561 | ] 562 | } -------------------------------------------------------------------------------- /Coding_Style_Guide.md: -------------------------------------------------------------------------------- 1 | # Coding Style Guide 2 | 3 | There are not only many different ways in which one can solve a programming task, but also many ways of writing (or styling) one's code. 4 | 5 | Similar to natural languages, programming languages allow you to make stylistic decisions. This ranges from relatively simple things such as the number of indentations and the preferred way of naming variables and objects to more sophisticated questions such as using Idioms used and recognized by the community. 6 | 7 | For example, a common and Pythonic principle (well, convention) is to use the `_` (underscore) for ignored variables such as in: 8 | 9 | ```Python 10 | for _ in range(10): 11 | print('Hello World') 12 | ``` 13 | 14 | We could have used anything instead of `_`. However, as the variable is not needed, it's a convention to use an underscore. 15 | 16 | The Python community, at least many, pride themselves in writing highly readable and [*Pythonic* code](https://docs.python-guide.org/writing/style/). The two most famous guidelines are [**PEP 8** - *Style Guide for Python Code*](https://www.python.org/dev/peps/pep-0008) as well as the more poetic [**PEP 20** - *The Zen of Python*](https://www.python.org/dev/peps/pep-0020/). 17 | 18 | In addition, many companies and organizations publish their own style guides and documentation. A famously and widely applied guide that is, arguably, more practical than *PEP 8* in many regards is [Google's Python Style Guide](https://google.github.io/styleguide/pyguide.html). 
19 | 20 | ## Basics 21 | 22 | Of course, this is not a complete style guide. However, I wanted to discuss some of the decisions made: 23 | 24 | * In *Colab* notebooks, two spaces are used for indentation to make the code more compact. In 'regular' Python scripts, following [PEP 8](https://www.python.org/dev/peps/pep-0008/), four spaces are the norm and preferred. 25 | * Single quotation marks (`'hello world'`) are used instead of double ones. Ultimately, it's just a preference thing. 26 | * String formatting is performed using [f-strings](https://www.python.org/dev/peps/pep-0498/). They are easy to write, easy to read, and just plain better than other options in almost any case. 27 | * While trying to stay within the 79 characters per line limit posited by *PEP 8*, this is treated as a suggestion rather than a rule. 28 | * Very basic commenting using the `#` symbol. While [docstrings](https://www.python.org/dev/peps/pep-0257/) are fantastic, I believe that they overcomplicate things for beginners in many cases. 29 | 30 | Generally speaking, for this workshop, I tried to prioritize readable code over highly efficient or beautiful code. Furthermore, in many cases, I have been using fairly explicit ways of doing things. In other cases, I have deliberately used shorthands and idioms (e.g., list comprehensions) to demonstrate them. 31 | 32 | *For example:* 33 | 34 | Let's assume we have a tokenized list of words `sentence`, and we want the same list but in lower case. The following two things both achieve the same goal: 35 | 36 | ```Python 37 | sentence = ['Hello', 'World'] 38 | lowercase = [] 39 | 40 | for word in sentence: 41 | lowercase.append(word.lower()) 42 | ``` 43 | 44 | ```Python 45 | lowercase = [w.lower() for w in sentence] 46 | ``` 47 | 48 | While the second one, a list comprehension, arguably is more elegant and possibly also more pythonic, it is harder to understand for beginners.
49 | 50 | ## Auto-Formatting 51 | 52 | There are a number of fantastic tools available that help you with writing better, more beautiful, and cleaner code. Some commonly used auto-formatters are [`autopep8`](https://github.com/hhatto/autopep8#features), Google's [`yapf`](https://github.com/google/yapf) as well as [`black`](https://github.com/psf/black). 53 | 54 | For this workshop, I opted against using or introducing these tools to reduce complexity. Also, I made some pedagogical/didactic decisions that run against *PEP 8*. 55 | 56 | ## Type Hinting 57 | 58 | Since Python 3.5, [type hinting](https://www.python.org/dev/peps/pep-0484/) is supported in the language. Put simply, type hinting is a way of making the types (e.g., *integer* or *string*) more explicit. 59 | 60 | ```Python 61 | def hello(name: str) -> str: 62 | return f'Hello, {name}' 63 | ``` 64 | 65 | In the example above, type hints are used to clearly indicate that the function `hello` takes one string (`name`) and returns a string as well. 66 | 67 | Type hinting has both pros and cons ([Overview by Christopher Bailey](https://realpython.com/lessons/pros-and-cons-type-hints/)) and there's a lively debate on whether to use it or not. Personally, I think type hinting has its place within larger projects that can benefit from easier documentation and more explicit engineering. However, for the sake of learning Python and for smaller tools, I believe it first and foremost adds unnecessary complexity. Hence, there is no type hinting in this workshop! 68 | -------------------------------------------------------------------------------- /Command_Line_Primer.md: -------------------------------------------------------------------------------- 1 | # (Linux) Command Line Primer 2 | 3 | The command line is a text-based interface to your computer (or a computer in the cloud). It allows you to perform actions on the computer by running commands.
4 | 5 | In (*Google Colab* / *Jupyter*) notebooks, you can execute these commands by prefixing them with the `!` character. 6 | Therefore, you could, for example, run `!ls` in a notebook cell to list all files in the current folder. 7 | 8 | **Please note:** The command line is a *very* powerful and useful tool worth exploring. However, this very short primer is only going to introduce you to some very basic commands that will help you during this workshop. 9 | 10 | ## Navigating the File System 11 | 12 | When you are using the command line, you are always 'within' a folder on your file system. The folder you are in can be identified by a path, for example `/home/linguistics/exercises`. In this example, there is a folder called `exercises` which is in a folder called `linguistics` which is in a folder called `home` which is in the so-called root directory `/` of the file system. 13 | 14 | Similarly, if you `git clone` this repository, you will have, for example, this folder: `python-programming-for-linguists/2020/data/wikipedia`. 15 | 16 | If you want to know your current folder, you can run the `pwd` command on the command line. 17 | 18 | If you want to move to a different folder, you can run `cd path` (e.g., `cd python-programming-for-linguists/2020/data/`) to *change* the *directory*. You can also use `cd ..` to go up one folder. Therefore, if you are in `python-programming-for-linguists/2020/data/` and run `cd ..`, you will end up in `python-programming-for-linguists/2020/`. 19 | 20 | If you want to know which files are in the current folder, you can run `ls`. 21 | 22 | ## Working with Files 23 | 24 | You can create an empty file in the current folder using `touch filename` (e.g., `touch exercise.txt`). 25 | 26 | If you want to have a brief look at the contents of a file, run `cat filename` (e.g., `cat exercise.txt`). 27 | 28 | In order to copy a file, you will need to run `cp filename copy_of_filename`.
For example, you could run `cp exercise.txt exercises/exercise.txt` to copy the file `exercise.txt` into a folder called `exercises`. In case you want to move the file, you can use `mv` instead of `cp`. 29 | 30 | If you need to delete a file, you can run `rm filename` (e.g., `rm exercise.txt`). 31 | 32 | ## Useful Commands 33 | 34 | Find a specific string in a file 35 | 36 | `cat file.txt | grep 'search term'` 37 | 38 | Replace every instance of *word_a* with *word_b* in file `file.txt` 39 | 40 | `sed 's/word_a/word_b/g' file.txt` 41 | 42 | Display the differences between `file_a.txt` and `file_b.txt` 43 | 44 | `diff file_a.txt file_b.txt` 45 | 46 | Count the lines, words, and characters in `file.txt` 47 | 48 | `wc file.txt` or `wc -w file.txt` to just get the word count. 49 | 50 | ## Video Tutorial 51 | 52 | I have also prepared a video tutorial on using the Linux shell. This [*Shell Primer for Linguistics*](https://www.youtube.com/watch?v=6H-D6ujhMOY) is available via YouTube. 53 | 54 | ## Additional Resources 55 | 56 | [Linux Terminal Tools Tutorial by Ketan M.](https://github.com/ketancmaheshwari/lisa19) 57 | -------------------------------------------------------------------------------- /Commenting_in_Python.md: -------------------------------------------------------------------------------- 1 | # Commenting in Python 2 | 3 | A key principle in software development is **documenting code**. This is something we have not really done in this workshop besides adding some comments to the notebooks. 4 | 5 | However, especially if you are writing longer programs and scripts, adding good comments to your code is a must. This is true whether you work alone or whether there are multiple developers involved. 6 | 7 | While there are many sophisticated ways of documenting code, the most basic way is to add comments directly into your code.
8 | 9 | In Python, the two most common ways of doing this are the following: 10 | 11 | ## Single Line Comments 12 | 13 | ```python 14 | # The following line will print the string "Hello World". 15 | print('Hello World') 16 | ``` 17 | 18 | or: 19 | 20 | ```python 21 | tokenized = [...] # This is a tokenized representation of ... 22 | ``` 23 | 24 | ### Multiline Comments 25 | 26 | ```python 27 | def add(a, b): 28 | '''This function will take in two integers and 29 | return their sum.''' 30 | 31 | return a + b 32 | ``` 33 | 34 | ### Some Best Practices 35 | 36 | * Avoid comments which simply repeat what's in the code: `return a # Returns a` 37 | * Keep your comments brief and relevant (*Follow Grice's Maxims*) 38 | * Make sure that your comments reflect the current state of the code. Update the comments if necessary. 39 | * If there are guidelines, follow them! 40 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # License 2 | 3 | ## Code and Notebooks 4 | 5 | MIT License 6 | 7 | Copyright (c) 2021 Ingo Kleiber 8 | 9 | Permission is hereby granted, free of charge, to any person obtaining a copy 10 | of this software and associated documentation files (the "Software"), to deal 11 | in the Software without restriction, including without limitation the rights 12 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | copies of the Software, and to permit persons to whom the Software is 14 | furnished to do so, subject to the following conditions: 15 | 16 | The above copyright notice and this permission notice shall be included in all 17 | copies or substantial portions of the Software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | ## Slides, Videos, and Exercises 28 | 29 | This work is licensed under a [Creative Commons Attribution-ShareAlike 4.0 International License](https://creativecommons.org/licenses/by-sa/4.0/). 30 | -------------------------------------------------------------------------------- /Links_to_Resources.md: -------------------------------------------------------------------------------- 1 | # Additional Resources 2 | 3 | ## Books 4 | 5 | * [Zed A. Shaw - Learn Python 3 the Hard Way](https://learnpythonthehardway.org/python3) 6 | * [Kenneth Reitz & Tanya Schlusser - The Hitchhiker's Guide to Python](https://docs.python-guide.org) 7 | * [Tim Hall & J-P Stacey - Python 3 for Absolute Beginners](https://www.springer.com/de/book/9781430216322) 8 | * [Eric Matthes - Python Crash Course](https://nostarch.com/pythoncrashcourse2e) 9 | * [Michael Hammond - Python for Linguists](https://www.cambridge.org/core/books/python-for-linguists/84236519ADE8F28EFF77E12D277DD3E2) 10 | * [The NLTK Book](https://www.nltk.org/book/) 11 | 12 | ## Online Courses 13 | 14 | ### Python in General 15 | 16 | * [Codecademy](https://www.codecademy.com) 17 | * [mimo](https://getmimo.com) 18 | * [DataCamp](https://datacamp.com) 19 | * [Talk Python Training](https://training.talkpython.fm/) 20 | * [Django Girls](https://djangogirls.org/) 21 | * [FreeCodeCamp](https://www.freecodecamp.org/) 22 | 23 | ### NLP 24 | 25 | * [fast.ai A Code-First Introduction to Natural Language Processing](https://www.fast.ai/2019/07/08/fastai-nlp/) 26 | * [Lena Voita - NLP Course | For You](https://lena-voita.github.io/nlp_course.html) 27 | 28 | ### spaCy 29 | 30 | * [Explosion/Vincent Warmerdam - Intro to NLP with 
spaCy](https://www.youtube.com/watch?v=WnGPv6HnBok&list=PLBmcuObd5An559HbDr_alBnwVsGq-7uTF) 31 | * [Explosion/Ines Montani - Advanced NLP with spaCy](https://www.youtube.com/watch?v=THduWAnG97k&list=PLBmcuObd5An7CEbfoZcRVfswgB_FKz6Gb) 32 | 33 | ## Python/NLP University Classes 34 | 35 | * [Stanford NLP’s CS224n: *Natural Language Processing with Deep Learning*](http://web.stanford.edu/class/cs224n/) 36 | * [Carnegie Mellon's CS 11-747: Neural Networks for NLP](http://phontron.com/class/nn4nlp2021/index.html) 37 | * [Michigan State University's CSE 842: A Hands-on Introduction to Natural Language Processing (NLP)](https://github.com/deskool/nlp-class) 38 | 39 | ## Articles 40 | 41 | * [Top 10 Python Libraries for Natural Language Processing (NLP) in 2020](https://kleiber.me/blog/2020/08/26/top-10-python-nlp-libraries-2020/) 42 | 43 | ## Other Resources 44 | 45 | * [Podcast: *Talk Python To Me* by Michael Kennedy](https://talkpython.fm) 46 | * [PyCon](https://pycon.org/) 47 | * [Peter Norvig’s *pytudes*](https://github.com/norvig/pytudes) 48 | * [Calmcode - Great Short Video Tutorials](https://calmcode.io/) 49 | * [Download NLTK Data and Corpora](http://www.nltk.org/nltk_data/) 50 | * [What the f*ck Python!](https://github.com/satwikkansal/wtfpython) 51 | * [NLP Tutorial (building models using PyTorch)](https://github.com/graykode/nlp-tutorial) 52 | * [Meta List of Resources at education.python.org](https://education.python.org/resources/resource/list) 53 | -------------------------------------------------------------------------------- /Markdown_Primer.md: -------------------------------------------------------------------------------- 1 | # Markdown Primer 2 | 3 | Markdown is a simple [markup language](https://en.wikipedia.org/wiki/Markup_language) used for creating formatted text. 4 | For the purpose of this workshop, Markdown is interesting because it can be used to format cells in *Colab* or *Jupyter* notebooks.
5 | *GitHub* is also rendering Markdown files (usually the extension is `.md`). 6 | 7 | Maybe you have noticed that this file itself is formatted using Markdown. If you're interested, have a look at the [raw file](https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/main/Markdown_Primer.md) to see the underlying Markdown code. 8 | 9 | ## Formatting Text 10 | 11 | Here are a few examples for you to try: 12 | 13 | `*Text*` 14 | 15 | *Text* 16 | 17 | `**Text**` 18 | 19 | **Text** 20 | 21 | `~Text~` 22 | 23 | ~Text~ 24 | 25 | ``` 26 | * A 27 | * B 28 | * C 29 | ``` 30 | 31 | * A 32 | * B 33 | * C 34 | 35 | `[Google Colab](https://colab.research.google.com)` 36 | 37 | [Google Colab](https://colab.research.google.com) 38 | 39 | `# Headline Level 1` 40 | # Headline Level 1 41 | 42 | `## Headline Level 2` 43 | ## Headline Level 2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python Programming for Linguists 2 | 3 | [![Learning Path: Long](https://github.com/IngoKl/python-programming-for-linguists/blob/main/images/lp-long.svg)](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-beginner-long.md) 4 | [![Learning Path: Short](https://github.com/IngoKl/python-programming-for-linguists/blob/main/images/lp-short.svg)](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-beginner-short.md) 5 | [![Learning Path: Experienced](https://github.com/IngoKl/python-programming-for-linguists/blob/main/images/lp-experienced.svg)](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-experienced.md) 6 | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IngoKl/python-programming-for-linguists/) 7 | 8 | ![Python Programming for
Linguists](https://github.com/IngoKl/python-programming-for-linguists/blob/main/images/banner.png) 9 | 10 | --- 11 | 12 | The last **live session** (about three hours) has taken place on **June 11th, 2021** at **18:00 CEST** on [Twitch](https://www.twitch.tv/ingokl). 13 | 14 | **Quick Start**: Go straight to the [default learning path](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-beginner-long.md) – *Absolute Beginner (Long)*. 15 | 16 | --- 17 | 18 | Welcome to this repository for the **Python Programming for Linguists** workshop. 19 | 20 | In this workshop, consisting of several videos and exercises, as well as a live session (recordings/videos available), you will be introduced to Python and its application within (corpus) linguistics. After a short general introduction to programming as well as Python, we will utilize Python to solve several (corpus) linguistic exercises. 21 | 22 | This workshop is specifically targeted towards people who have **no prior experience programming**. While this workshop is not intended to make you a programmer, you will gain a fundamental understanding of how programming works and how to proceed should you want to deepen your knowledge and skills. In addition, by looking at various example tasks that are commonly solved using existing software, we will try to deepen our understanding of how commonly used tools work under the hood. 23 | 24 | Please **be aware** that this workshop was specifically designed as **a first introduction to programming for non-coders and linguists** and not as a fully-fledged Python course. Therefore, we will take some shortcuts, disobey some best practices, and hide away quite a few of the underlying complexities. If you are interested in a more thorough introduction or want to deepen your already existing knowledge, please refer to the final video in which I present many great resources. 
Also, feel free to have a look at the [List of Additional Resources](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Links_to_Resources.md). 25 | 26 | While the materials and exercises are targeted towards beginners, they are **challenging**, and this workshop is designed as an **intense deep dive**! 27 | 28 | Please do not feel discouraged if you get stuck or if something seems too hard at first. I have provided solutions for all exercises, and you will also find lots of additional helpful resources in this repository. Also, while not required, you can prepare for this workshop by consulting other slower-paced introductory courses such as the ones listed in [this document](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Links_to_Resources.md). *You will get there!* 🚀 29 | 30 | This repository currently reflects the second iteration of this workshop (2021) which started as part of my *Data Literacy for Linguists* class taught at the University of Cologne. Therefore, I will be reusing most of the material from the 2020 rendition. In 2022, the material has been slightly updated, especially regarding a new solutions video for Exercises 8 to 17 and a note regarding the use of [ChatGPT](https://chat.openai.com) and similar AI systems. Hence, please do not get confused about the `2020`, `2021`, and `2022` folders and simply follow the **learning path(s) provided below**. That being said, feel free to explore as much as you want! 31 | 32 | Originally, this workshop has been inspired by workshops I held at [35c3](https://events.ccc.de/congress/2018/wiki/index.php/Session:(Python)_Programming_for_Absolute_Beginners) and [36c3 a](https://events.ccc.de/congress/2019/wiki/index.php/Session:Python_Programming_for_Absolute_Beginners)/[36c3 b](https://events.ccc.de/congress/2019/wiki/index.php/Session:Introduction_to_Natural_Language_Processing). 33 | 34 | ## 1.
Learning Objectives 35 | 36 | After completing this workshop, you will be able to ... 37 | 38 | * describe what programming essentially is about. 39 | * name and describe some basic programming terminology. 40 | * model simple problems in terms of data structures and basic algorithms. 41 | * write basic scripts in Python in order to solve specific problems. 42 | * utilize third-party libraries such as [NLTK](https://www.nltk.org), [spaCy](https://spacy.io), and [TextDirectory](https://github.com/IngoKl/textdirectory). 43 | * construct and apply basic regular expressions. 44 | * utilize Python for text manipulation. 45 | * utilize Python to perform concordance and frequency analysis. 46 | * automatically annotate texts (PoS, Universal Dependencies, NER) using spaCy. 47 | * scrape web data in order to build corpora (Web as Corpus) using Python. 48 | * compute basic statistics using Python. 49 | 50 | ## 2. Workshop Outline & Learning Paths 51 | 52 | This workshop is designed as a blend of **asynchronous and synchronous elements**. However, as everything will be recorded, you can also do this in a completely self-paced fashion. 53 | 54 | The general idea is that you watch a series of videos and complete/attempt a series of exercises before joining the synchronous live session hosted on *Twitch*. During this live session, I will be solving [Exercises 8 to 17](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Exercises%208-17.pdf) while you are invited to code along and to ask questions. 55 | 56 | ### 2.1 Learning Paths 57 | 58 | To make things as straightforward as possible, I have created three learning paths for you to follow. 59 | 60 | The *Absolute Beginner (Long)* path also contains additional materials and exercises. If you are already somewhat familiar with Python, you can have a look at the *Experienced in Python* path. 
61 | 62 | * [Absolute Beginner (Long)](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-beginner-long.md) 63 | * [Absolute Beginner (Short)](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-beginner-short.md) 64 | * [Experienced in Python](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/learning-path-experienced.md) 65 | 66 | ### 2.2 Live Sessions 67 | 68 | The last live session (about three hours) has taken place on **June 11th, 2021** at **18:00 CEST** on [Twitch](https://www.twitch.tv/ingokl). 69 | 70 | If you are interested, last year's recording (slightly edited and polished) is available on [YouTube](https://www.youtube.com/watch?v=70g9oeclNac). 71 | 72 | Also, as of December 2022, pre-recorded solutions for Exercises 8 to 17, instead of the live session(s), are available for you to watch (see below). 73 | 74 | ## 3. Coding Along 75 | 76 | I want to strongly encourage you to code along and to experiment with the exercises. The easiest way of doing this is to use *Google Colab*. In order to do this, you will need a Google Account. If you have never used *Colab* you might want to have a look at [this tutorial on YouTube](https://www.youtube.com/watch?v=JxjUEvQSFkU&list=PLG6oHk0SZfBwNq7gpK45e3YPTbfNfmKfi&index=1). 77 | 78 | If you do not want to rely on Google, you can also set up your own local development environment. For a tutorial on how to do this on Windows, have a look at the video ["Setting Up Your Development Environment (Windows)."](https://www.youtube.com/watch?v=xrXEouns3fg) 79 | 80 | The videos are intended to be paused from time to time. Do not feel forced to watch through a whole video before playing with the code 😀. 81 | 82 | ## 4.
List of Materials 83 | 84 | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/IngoKl/python-programming-for-linguists/) 85 | 86 | This is a list of the available materials. I would suggest **following one of the learning paths provided above**, but of course, you are free to use the materials as you see fit. 87 | 88 | There is no fixed schedule for *updating these materials* as I am not actively teaching this workshop at the moment. While the "old" materials, e.g., from 2020, are largely kept as is, I have gone back and made some very minor changes – e.g., when I came across a typo. 89 | 90 | ### 4.1 Videos and Recordings 91 | 92 | All of these videos are currently hosted on YouTube ([Playlist](https://www.youtube.com/playlist?list=PLG6oHk0SZfBxRIegm0QvzDvmumma7grp5)). Additional *Technology Primers for Linguistics* are available via their own [YouTube playlist](https://www.youtube.com/watch?v=7EETKVp20y4&list=PLG6oHk0SZfBzNHJQC4WLIMopcfD271uxh). 93 | 94 | * [00 - Python Programming for Absolute Beginners](https://www.youtube.com/watch?v=4UnF45lniyY) 95 | * [01 - The Pizza Problem](https://www.youtube.com/watch?v=g9tOyVI5B3E) 96 | * [02 - Working with Files, Texts, and Regular Expressions](https://www.youtube.com/watch?v=y37_JvSY-GM) 97 | * [03 - Python for (Corpus) Linguists / Exercises 8 to 17 (2022 Recording)](https://youtu.be/DLyVL0mEISU) ([Old 2020 Recording](https://www.youtube.com/watch?v=70g9oeclNac)) 98 | * [04 - Summary and Resources](https://www.youtube.com/watch?v=ajKqESDmrKc) 99 | * [05 - Setting Up Your Development Environment (Windows)](https://www.youtube.com/watch?v=xrXEouns3fg) (Alternative to using *Google Colab*. Please watch [video 00](https://www.youtube.com/watch?v=4UnF45lniyY) first in any case!) 
100 | * [06 - Getting Started with Google Colab](https://www.youtube.com/watch?v=JxjUEvQSFkU) 101 | 102 | ### 4.2 Exercises 103 | 104 | This workshop, in addition to the videos and livestream, has 17 main exercises as well as a number of additional ones. Solutions to these exercises are available in the form of notebooks. 105 | 106 | * [Exercises 1 to 3](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%201-3.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_1_3.ipynb)) 107 | * [Exercises 4 and 5](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%204-5.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_4_5.ipynb)) 108 | * [Exercises 6 and 7](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Exercises%206-7.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/exercises/Solutions_Exercises_6_7.ipynb)) 109 | * [Exercises 8 to 17](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Exercises%208-17.pdf) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2022/exercises/Solutions_Exercises_8_17.ipynb)) 110 | * [Additional Exercise: Regular Expressions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Additional_Exercises_RegEx.ipynb) ([Exercise Video](https://www.youtube.com/watch?v=GGEveroG3Fg)) 111 | * [Additional Exercise: Frequency Distribution](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Additional_Exercises_Frequency_Distribution.ipynb) ([Solutions](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2021/exercises/Additional_Exercises_Solutions_Frequency_Distribution.ipynb)) 112 | 113 | Please note that for each exercise, 
you will find **solutions in this repository**. Don't feel bad if you cannot immediately solve the exercises - the solutions are there to help you. Of course, feel free to take apart these suggested solutions and play with them. 114 | 115 | ### 4.3 Slides 116 | 117 | All of the slides (in both `.pptx` and `.pdf`) are available as well. See [2020](https://github.com/IngoKl/python-programming-for-linguists/tree/main/2020/slides), [2021](https://github.com/IngoKl/python-programming-for-linguists/tree/main/2021/slides), and [2022](https://github.com/IngoKl/python-programming-for-linguists/tree/main/2022/slides). 118 | 119 | ### 4.4 Bonus Notebooks 120 | 121 | Aside from the main material, there are also a few **advanced bonus notebooks** in this repository for you to explore. Have a look at them to see more advanced and/or alternative solutions to some of the problems discussed in the workshop. 122 | 123 | * [Pizza Problem: Dataclass Solution](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Bonus%20Notebooks/Pizza_Problem_Dataclass_Solution.ipynb) 124 | * [Understanding Classes and Objects](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Bonus%20Notebooks/Understanding_Classes_and_Objects.ipynb) 125 | * [Working with R in Python](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Bonus%20Notebooks/Working_with_R_in_Python.ipynb) 126 | 127 | ### 4.5 Helpful Additional Material and Primers 128 | 129 | * [Command Line Primer](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Command_Line_Primer.md) 130 | * [Markdown Primer](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Markdown_Primer.md) 131 | * [Commenting in Python](https://github.com/IngoKl/python-programming-for-linguists/blob/main/Commenting_in_Python.md) 132 | * [Video: A RegEx Primer for Linguistics](https://www.youtube.com/watch?v=p7-QkwOU9RY) 133 | * [Video: A Git Primer for 
Linguistics](https://www.youtube.com/watch?v=7EETKVp20y4) 134 | * [Video: A Shell Primer for Linguistics](https://www.youtube.com/watch?v=6H-D6ujhMOY) 135 | * [List of Additional Resources](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Links_to_Resources.md) 136 | 137 | ## 5. Python Version and Coding Style 138 | 139 | This workshop is based on **modern Python** and requires a version of Python >= 3.6. All of the code, as well as the used external libraries, should be compatible with everything up to Python 3.9 as well. If you are interested, also have a look at the [*Coding Style Guide*](https://github.com/IngoKl/python-programming-for-linguists/tree/main/Coding_Style_Guide.md) for this workshop in which I discuss how most of the code is styled and why. 140 | 141 | ## 6. License 142 | 143 | You are (relatively) free to use all of these materials as you like. 144 | 145 | * The code (notebooks and scripts) is licensed under the [MIT License](https://github.com/IngoKl/python-programming-for-linguists/blob/main/LICENSE.md). 146 | * The slides, videos, and exercises are licensed under a [CC BY-SA 4.0](https://creativecommons.org/licenses/by-sa/4.0) license. 147 | 148 | ### A Note on the HUM19UK Corpus by Fransina Stradling et al. 149 | 150 | In some exercises, this workshop relies on the wonderful [HUM19UK corpus](https://varieng.helsinki.fi/CoRD/corpora/HUM19UK/index.html) (Huddersfield, Utrecht, Middelburg Corpus of 19th Century Fiction) compiled by Fransina Stradling, Brian Walker, Dan McIntyre, Elliot Land, Hazel Price, and Michael Burke. 151 | 152 | Unfortunately, the corpus website (linguisticsathuddersfield.com) cannot be reached anymore, and access to the corpus has become harder for the moment. 
Hence, for now, and in the spirit of their original licensing, I have made the data available through one of my servers and updated the [download script](https://github.com/IngoKl/python-programming-for-linguists/blob/main/2020/data/download_hum19uk.sh) accordingly. 153 | -------------------------------------------------------------------------------- /images/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/IngoKl/python-programming-for-linguists/9390bf31404539eb96c7cee407c8586aba273364/images/banner.png -------------------------------------------------------------------------------- /images/lp-experienced.svg: -------------------------------------------------------------------------------- 1 | Learning Path: ExperiencedLearning PathExperienced -------------------------------------------------------------------------------- /images/lp-long.svg: -------------------------------------------------------------------------------- 1 | Learning Path: LongLearning PathLong -------------------------------------------------------------------------------- /images/lp-short.svg: -------------------------------------------------------------------------------- 1 | Learning Path: ShortLearning PathShort --------------------------------------------------------------------------------