├── 01_ed8 ├── apply_ed8.R └── ed8.yml ├── 02_neuralnet ├── apply_neuralnet.R └── models │ ├── anger │ ├── disgust │ ├── enthusiasm │ ├── fear │ ├── hope │ ├── joy │ ├── pride │ └── sadness ├── 03_electra ├── apply_electra.ipynb ├── helper │ ├── __pycache__ │ │ ├── inferencing.cpython-37.pyc │ │ ├── inferencing.cpython-39.pyc │ │ ├── training.cpython-37.pyc │ │ └── training.cpython-39.pyc │ ├── data_preparation.py │ ├── inferencing.py │ └── training.py ├── models │ └── final │ │ └── german-nlp-group │ │ └── electra-base-german-uncased │ │ └── config.json └── requirements.txt └── README.md /01_ed8/apply_ed8.R: -------------------------------------------------------------------------------- 1 | ################################################################## 2 | ################################################################## 3 | ## Widmann & Wich: Creating and Comparing Dictionary, Word Embedding, and Transformer-based 4 | ## Models to Measure Discrete Emotions in German Political Text 5 | ## Political Analysis 6 | ## widmann@ps.au.dk 7 | ################################################################## 8 | ################################################################## 9 | 10 | 11 | #### Applying the ed8 dictionary 12 | 13 | #### FUNCTION ed8 ################################# 14 | 15 | # First, load the quanteda package 16 | library(quanteda) 17 | 18 | 19 | # Load in the dictionary 20 | ed8 <- dictionary(file = "./ed8.yml", 21 | format = "YAML") 22 | 23 | # Create the function 24 | get_ed8_emotions <- function(data){ 25 | #Create a corpus from your data frame 26 | corp <- corpus(data) 27 | 28 | #Tokenize corpus and pre-process (remove punctuation, numbers, and URLs) 29 | toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE) 30 | 31 | #Create a DFM just to measure the number of terms before removing stopwords 32 | terms_dfm <- dfm(toks) 33 | 34 | #Create bigram compounds for negation control 35 | toks_neg_bigram <- tokens_compound(toks, pattern = phrase("nicht *")) 36 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("nichts *")) 37 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("kein *")) 38 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keine *")) 39 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keinen *")) 40 | 41 | #Turn tokens into a DFM and remove stopwords 42 | emo_dfm <- dfm(toks_neg_bigram, remove = stopwords("de")) 43 | 44 | #Apply the dictionary 45 | dict_dfm_results <- dfm_lookup(emo_dfm, ed8) 46 | 47 | #Convert results back to a data frame 48 | results_df <- cbind(data, convert(dict_dfm_results, to = 'data.frame')) 49 | 50 | #Assign its length to each document 51 | results_df$terms_raw <- ntoken(terms_dfm) 52 | results_df$terms <- ntoken(emo_dfm) 53 | 54 | return(results_df) 55 | } 56 | 57 | 58 | # Now you can use the function on your data; simply pass a data frame with a column called "text" containing the text data 59 | results <- get_ed8_emotions(data) 60 | 61 | # Finally, you can create normalized emotion scores by dividing the ed8 scores by document length 62 | results$anger.norm <- results$ed8.ANGER / results$terms 63 | results$fear.norm <- results$ed8.FEAR / results$terms 64 | results$disgust.norm <- results$ed8.DISGUST / results$terms 65 | results$sadness.norm <- results$ed8.SADNESS / results$terms 66 | results$joy.norm <- results$ed8.JOY / results$terms 67 | results$enthusiasm.norm <- results$ed8.ENTHUSIASM / results$terms 68 | results$pride.norm <-
results$ed8.PRIDE / results$terms 69 | results$hope.norm <- results$ed8.HOPE / results$terms 70 | -------------------------------------------------------------------------------- /02_neuralnet/apply_neuralnet.R: -------------------------------------------------------------------------------- 1 | 2 | ################################################################## 3 | ################################################################## 4 | ## Widmann & Wich: Creating and Comparing Dictionary, Word Embedding, and Transformer-based 5 | ## Models to Measure Discrete Emotions in German Political Text 6 | ## Political Analysis 7 | ## widmann@ps.au.dk 8 | ################################################################## 9 | ################################################################## 10 | 11 | #### Applying Neural Network Classifiers based on Locally Trained German Word Embeddings 12 | 13 | # Load necessary packages 14 | library(quanteda) 15 | library(corpus) 16 | library(keras) 17 | library(tidytext) 18 | 19 | # Set working directory 20 | setwd("./02_neuralnet") 21 | 22 | 23 | # First, you need to split your text into sentences 24 | data <- data %>% 25 | unnest_tokens(sentences, text, "sentences") 26 | 27 | # Now, you can turn your text documents into a corpus 28 | corp <- corpus(data$sentences) 29 | 30 | # Create a document-feature matrix and conduct pre-processing 31 | text_dfm <- dfm(corp, remove=stopwords("german"), verbose=TRUE, tolower = TRUE) 32 | 33 | # Stemming 34 | text_dfm <- dfm_wordstem(text_dfm, language = "german") 35 | 36 | # Now, we will convert the word embeddings into a data frame 37 | # and match the features from each document with their corresponding embeddings 38 | 39 | # First, we load the locally trained word embeddings into R 40 | w2v <- readr::read_delim("./vec_ed_preprocessed.txt", 41 | skip=1, delim=" ", quote="", 42 | col_names=c("word", paste0("V", 1:100))) 43 | 44 | # Stem the terms included in the embeddings to increase matches 45 | w2v$word <- text_tokens(w2v$word, stemmer = "de") 46 | 47 | # Create a new feature matrix for the embeddings 48 | embed <- matrix(NA, nrow=ndoc(text_dfm), ncol=100) 49 | for (i in 1:ndoc(text_dfm)){ 50 | if (i %% 100 == 0) message(i, '/', ndoc(text_dfm)) 51 | # extract word counts 52 | vec <- as.numeric(text_dfm[i,]) 53 | # keep words with counts of 1 or more 54 | doc_words <- featnames(text_dfm)[vec>0] 55 | # extract embeddings for those words 56 | embed_vec <- w2v[w2v$word %in% doc_words, 2:101] 57 | # aggregate from word- to document-level embeddings by taking the average 58 | embed[i,] <- colMeans(embed_vec, na.rm=TRUE) 59 | # if no words are in the embeddings, simply set the vector to 0 60 | if (nrow(embed_vec)==0) embed[i,] <- 0 61 | } 62 | 63 | # After you have created the sentence embeddings, you can apply the trained machine learning model for each emotion 64 | # The machine learning models are provided in the folder "./02_neuralnet/models" 65 | # for example, anger (the model files are named after the emotions, e.g. "anger"): 66 | 67 | model <- load_model_hdf5("./models/anger", custom_objects = NULL, compile = TRUE) 68 | wb.anger <- model %>% predict_classes(embed) 69 | data <- cbind(data, wb.anger) -------------------------------------------------------------------------------- /02_neuralnet/models/anger: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/anger -------------------------------------------------------------------------------- /02_neuralnet/models/disgust:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/disgust -------------------------------------------------------------------------------- /02_neuralnet/models/enthusiasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/enthusiasm -------------------------------------------------------------------------------- /02_neuralnet/models/fear: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/fear -------------------------------------------------------------------------------- /02_neuralnet/models/hope: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/hope -------------------------------------------------------------------------------- /02_neuralnet/models/joy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/joy -------------------------------------------------------------------------------- /02_neuralnet/models/pride: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/pride -------------------------------------------------------------------------------- /02_neuralnet/models/sadness: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/sadness -------------------------------------------------------------------------------- /03_electra/apply_electra.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"id":"1dcf7613","metadata":{"id":"1dcf7613","outputId":"35148b94-b9cc-4ae0-d8a2-38f13ee2cde3"},"outputs":[{"name":"stdout","output_type":"stream","text":["/Volumes/GoogleDrive/My Drive/Work/Tools/3x8emotions/03_electra\n"]}],"source":["# Set working directory to the folder '03_electra', which is located in the replication folder\n","%cd /03_electra"]},{"cell_type":"markdown","id":"ce8c32d1-2b11-47f5-af78-63a5a18d13aa","metadata":{"id":"ce8c32d1-2b11-47f5-af78-63a5a18d13aa"},"source":["To install all required packages listed in requirements.txt, you can simply use the following pip command: pip install -r requirements.txt"]},{"cell_type":"code","execution_count":null,"id":"cf55407a","metadata":{"id":"cf55407a","outputId":"c359bcf2-6889-444f-a38b-d18813f984be"},"outputs":[{"ename":"ModuleNotFoundError","evalue":"No module named 'transformers'","output_type":"error","traceback":["ModuleNotFoundError: No module named 'transformers'"]}],"source":["import transformers\n","import pandas as pd"]},{"cell_type":"markdown","id":"41c8323e-de7d-4f26-850a-ff937c6ee266","metadata":{"id":"41c8323e-de7d-4f26-850a-ff937c6ee266"},"source":["# Inferencing New Data"]},{"cell_type":"code","execution_count":null,"id":"61310d76","metadata":{"id":"61310d76","outputId":"f0ab6327-06b0-47cb-e757-d396b7f0b03a"},"outputs":[{"data":{"text/html":["
(garbled HTML table preview removed — 990 rows × 86 columns; the same output follows as text/plain)
"],"text/plain":[" Unnamed: 0 TextID \\\n","0 14 2019127_0016 \n","1 20 2019127_0022 \n","2 25 2019127_0026 \n","3 28 2019127_0029 \n","4 30 2019127_0031 \n",".. ... ... \n","985 11003 2019127_9906 \n","986 11008 2019127_9911 \n","987 11016 2019127_9918 \n","988 11027 2019127_9928 \n","989 11086 2019127_9981 \n","\n"," Text Answer.1 \\\n","0 sanktionen sind immer die schlechteste option ... Ärger \n","1 fremdenfeindlichkeit rassismus hass und ressen... Ärger \n","2 das muss auch die ehemalige weinkönigin verstehen Keine Emotion \n","3 deshalb die linke wählen zb am 28 Keine Emotion \n","4 die große koalition bringt nun einen antrag fü... Keine Emotion \n",".. ... ... \n","985 die betroffenen haben heute eine ähnliche lebe... Stolz \n","986 da ist für die linke ganz klar durch kluge dip... Hoffnung \n","987 da haben sie mich leider enttäuscht Ärger \n","988 ich will es ganz deutlich sagen das gesamte ve... Ärger \n","989 union und spd hatten den gesetzentwurf aber im... Ärger \n","\n"," Answer.2 Answer.3 Answer.4 h_anger h_fear h_disgust ... hf_hope \\\n","0 NaN NaN NaN 2 0 0 ... 0 \n","1 NaN NaN NaN 3 0 1 ... 0 \n","2 NaN NaN NaN 1 0 0 ... 0 \n","3 NaN NaN NaN 0 0 0 ... 1 \n","4 NaN NaN NaN 0 0 0 ... 0 \n",".. ... ... ... ... ... ... ... ... \n","985 NaN NaN NaN 0 0 0 ... 1 \n","986 NaN NaN NaN 0 0 0 ... 1 \n","987 NaN NaN NaN 2 0 0 ... 0 \n","988 NaN NaN NaN 5 0 0 ... 0 \n","989 NaN NaN NaN 4 0 0 ... 0 \n","\n"," df_hope wb.anger wb.fear wb.disgust wb.sadness wb.joy wb.enthusiasm \\\n","0 0 1 1 0 0 0 0 \n","1 0 1 1 1 0 0 0 \n","2 0 0 0 0 0 0 0 \n","3 0 0 0 0 0 0 0 \n","4 0 0 0 0 0 0 0 \n",".. ... ... ... ... ... ... ... \n","985 1 0 0 0 0 0 0 \n","986 0 0 0 0 0 0 1 \n","987 0 0 0 0 1 0 0 \n","988 0 1 0 0 0 0 0 \n","989 0 1 0 0 1 0 0 \n","\n"," wb.pride wb.hope \n","0 0 0 \n","1 0 0 \n","2 0 0 \n","3 0 0 \n","4 0 0 \n",".. ... ... \n","985 0 0 \n","986 1 0 \n","987 0 0 \n","988 0 0 \n","989 0 0 \n","\n","[990 rows x 86 columns]"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["# Load in your data (e.g. as a csv file)\n","df = pd.read_csv('./data.csv') \n","documents = list(df.text) "]},{"cell_type":"code","execution_count":null,"id":"d5052e91","metadata":{"id":"d5052e91"},"outputs":[],"source":["from helper.inferencing import Inferencer"]},{"cell_type":"code","execution_count":null,"id":"615ff91d","metadata":{"id":"615ff91d"},"outputs":[],"source":["# Predicting\n","predictor = Inferencer()\n","df = predictor.predict_dataframe(documents)"]},{"cell_type":"code","execution_count":null,"id":"85b19294","metadata":{"id":"85b19294","outputId":"b201074e-51d3-4372-df84-8b043e163a0c"},"outputs":[{"data":{"text/html":["
(garbled HTML table preview removed — 990 rows × 9 columns; the same output follows as text/plain)
"],"text/plain":[" text anger fear disgust \\\n","0 sanktionen sind immer die schlechteste option ... 1.0 1.0 0.0 \n","1 fremdenfeindlichkeit rassismus hass und ressen... 1.0 1.0 1.0 \n","2 das muss auch die ehemalige weinkönigin verstehen 0.0 0.0 0.0 \n","3 deshalb die linke wählen zb am 28 0.0 0.0 0.0 \n","4 die große koalition bringt nun einen antrag fü... 0.0 0.0 0.0 \n",".. ... ... ... ... \n","985 die betroffenen haben heute eine ähnliche lebe... 0.0 0.0 0.0 \n","986 da ist für die linke ganz klar durch kluge dip... 0.0 0.0 0.0 \n","987 da haben sie mich leider enttäuscht 1.0 0.0 0.0 \n","988 ich will es ganz deutlich sagen das gesamte ve... 1.0 0.0 0.0 \n","989 union und spd hatten den gesetzentwurf aber im... 1.0 0.0 0.0 \n","\n"," sadness joy enthusiasm pride hope \n","0 0.0 0.0 0.0 0.0 0.0 \n","1 1.0 0.0 0.0 0.0 0.0 \n","2 0.0 0.0 0.0 0.0 0.0 \n","3 0.0 0.0 0.0 0.0 0.0 \n","4 0.0 1.0 0.0 1.0 0.0 \n",".. ... ... ... ... ... \n","985 0.0 1.0 0.0 1.0 0.0 \n","986 0.0 0.0 1.0 1.0 0.0 \n","987 1.0 0.0 0.0 0.0 0.0 \n","988 0.0 0.0 0.0 0.0 0.0 \n","989 1.0 0.0 0.0 0.0 0.0 \n","\n","[990 rows x 9 columns]"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["# Show results\n","df"]},{"cell_type":"code","execution_count":null,"id":"eac44d08-2612-4699-a06d-ee78e77359d9","metadata":{"id":"eac44d08-2612-4699-a06d-ee78e77359d9"},"outputs":[],"source":["# Save results, e.g. as .csv file \n","df.to_csv('./electra_results.csv')"]}],"metadata":{"environment":{"kernel":"python3","name":"common-cu110.m87","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/base-cu110:m87"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.8"},"colab":{"name":"apply_electra.ipynb","provenance":[],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/inferencing.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/inferencing.cpython-37.pyc -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/inferencing.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/inferencing.cpython-39.pyc -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/training.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/training.cpython-37.pyc -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/training.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/training.cpython-39.pyc 
-------------------------------------------------------------------------------- /03_electra/helper/data_preparation.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import torch 6 | from datasets import load_dataset 7 | from sklearn.model_selection import train_test_split 8 | from torch.utils.data import ( 9 | DataLoader, 10 | Dataset, 11 | RandomSampler, 12 | SequentialSampler, 13 | random_split, 14 | ) 15 | 16 | 17 | class HateDataset(Dataset): 18 | def __init__(self, encodings, labels): 19 | self.encodings = encodings 20 | self.labels = labels 21 | 22 | def __getitem__(self, idx): 23 | item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} 24 | item["labels"] = torch.tensor(self.labels[idx]) 25 | return item 26 | 27 | def __len__(self): 28 | return len(self.labels) 29 | 30 | def getTempFiles( 31 | path_train, 32 | path_test, 33 | path_tmp, 34 | path_validation=None, 35 | frac_train=1.0, 36 | frac_val=0.1, 37 | seed=123, 38 | ): 39 | Path(path_tmp).mkdir(parents=True, exist_ok=True) 40 | path_tmp_train = Path(path_tmp) / "train.csv" 41 | path_tmp_validation = Path(path_tmp) / "validation.csv" 42 | path_tmp_test = Path(path_tmp) / "test.csv" 43 | 44 | df = pd.read_csv(path_train, encoding="utf-8", sep="\t") 45 | df = df.sample(frac=frac_train, random_state=seed) 46 | if path_validation is None: 47 | df_train, df_validation = train_test_split( 48 | df, test_size=frac_val, random_state=seed 49 | ) 50 | else: 51 | df_train = df 52 | df_validation = pd.read_csv(path_validation, encoding="utf-8", sep="\t") 53 | df_validation = df_validation.sample( 54 | n=int(frac_val * len(df_train)), random_state=seed 55 | ) 56 | 57 | df_test = pd.read_csv(path_test, encoding="utf-8", sep="\t") 58 | 59 | df_train.to_csv(path_tmp_train, encoding="utf-8", sep="\t", index=False) 60 | df_validation.to_csv(path_tmp_validation, encoding="utf-8", sep="\t", index=False) 61 | df_test.to_csv(path_tmp_test, encoding="utf-8", sep="\t", index=False) 62 | 63 | print(len(df_train), len(df_validation), len(df_test)) 64 | return path_tmp_train, path_tmp_validation, path_tmp_test 65 | 66 | 67 | def getHateDatasets(data_params, selected_dataset, tokenizer): 68 | label_name = data_params[selected_dataset]["label"] 69 | text_name = data_params[selected_dataset]["text"] 70 | 71 | def preprocess(row): 72 | # preprocess text 73 | row["text"] = re.sub(r"\|lbr\||\|LBR\||\|AMP\||>|&", " ", row["text"]) 74 | row["text"] = re.sub(r"(^|\s)@[A-Za-z0-9_-]*", " ", row["text"]) 75 | # convert string label to integer 76 | # mapping is stored in model_params 77 | selected = data_params[selected_dataset] 78 | row[selected["label"]] = selected["mapping"][row[selected["label"]]] 79 | return row 80 | 81 | #print(data_params[selected_dataset]["train"]) 82 | #print(data_params[selected_dataset]["validation"]) 83 | #print(data_params[selected_dataset]["test"]) 84 | # load data 85 | dataset = load_dataset( 86 | "csv", 87 | data_files={ 88 | "train": data_params[selected_dataset]["train"], 89 | "validation": data_params[selected_dataset]["validation"], 90 | "test": data_params[selected_dataset]["test"], 91 | }, 92 | delimiter="\t", 93 | ) 94 | 95 | # preprocess data 96 | dataset["train"] = dataset["train"].map(preprocess) 97 | dataset["validation"] = dataset["validation"].map(preprocess) 98 | dataset["test"] = dataset["test"].map(preprocess) 99 | 100 | # tokenize data 101 | train_encodings = tokenizer( 102 | 
dataset["train"][text_name], truncation=True, padding=True 103 | ) 104 | val_encodings = tokenizer( 105 | dataset["validation"][text_name], truncation=True, padding=True 106 | ) 107 | test_encodings = tokenizer( 108 | dataset["test"][text_name], truncation=True, padding=True 109 | ) 110 | 111 | train_dataset = HateDataset(train_encodings, dataset["train"][label_name]) 112 | val_dataset = HateDataset(val_encodings, dataset["validation"][label_name]) 113 | test_dataset = HateDataset(test_encodings, dataset["test"][label_name]) 114 | 115 | return train_dataset, val_dataset, test_dataset 116 | 117 | 118 | def getHateDataLoaders(data_params, selected_dataset, tokenizer, batch_size=128): 119 | train_dataset, val_dataset, test_dataset = getHateDatasets( 120 | data_params, selected_dataset, tokenizer 121 | ) 122 | train_loader = DataLoader( 123 | dataset=train_dataset, 124 | batch_size=batch_size, 125 | sampler=SequentialSampler(train_dataset), 126 | num_workers=8, 127 | ) 128 | val_loader = DataLoader( 129 | dataset=val_dataset, 130 | batch_size=batch_size, 131 | sampler=SequentialSampler(val_dataset), 132 | num_workers=8, 133 | ) 134 | test_loader = DataLoader( 135 | dataset=test_dataset, 136 | batch_size=batch_size, 137 | sampler=SequentialSampler(test_dataset), 138 | num_workers=8, 139 | ) 140 | 141 | return train_loader, val_loader, test_loader 142 | 143 | -------------------------------------------------------------------------------- /03_electra/helper/inferencing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | from transformers import ( 5 | AutoModel, 6 | AutoModelForSequenceClassification, 7 | AutoTokenizer, 8 | Trainer, 9 | TrainingArguments, 10 | set_seed, 11 | ) 12 | 13 | import helper.training as tr 14 | 15 | 16 | class Inferencer: 17 | def __init__(self): 18 | self.emotions = [ 19 | "anger", 20 | "fear", 21 | "disgust", 22 | "sadness", 23 | "joy", 24 | "enthusiasm", 25 | "pride", 26 | "hope", 27 | ] 28 | self.MODEL_NAME = "german-nlp-group/electra-base-german-uncased" 29 | self.DIR_TRAINED_MODEL = "./models/final" 30 | self.SEED = 7 31 | set_seed(self.SEED) 32 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 33 | 34 | self.model = AutoModelForSequenceClassification.from_pretrained( 35 | f"{self.DIR_TRAINED_MODEL}/{self.MODEL_NAME}", num_labels=8 36 | ).to(device=self.device) 37 | self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME) 38 | 39 | def predict(self, x): 40 | val = [] 41 | for record in x: 42 | # tokenize document 43 | inputs = self.tokenizer( 44 | record, truncation=True, padding=True, return_tensors="pt" 45 | ) 46 | inputs = inputs.to(device=self.device) 47 | # inference 48 | outputs = self.model(**inputs) 49 | logits = outputs.logits 50 | prediction = logits.sigmoid() 51 | prediction[prediction >= 0.5] = 1 52 | prediction[prediction < 0.5] = 0 53 | prediction = prediction.detach().cpu().numpy() 54 | val.append(prediction[0]) 55 | return np.array(val) 56 | 57 | def predict_dataframe(self, x): 58 | predictions = self.predict(x) 59 | list_for_df = [] 60 | for i in range(len(x)): 61 | row = [*[x[i]], *predictions[i]] 62 | list_for_df.append(row) 63 | columns = ["text"] + self.emotions 64 | return pd.DataFrame(list_for_df, columns=columns) 65 | -------------------------------------------------------------------------------- /03_electra/helper/training.py: -------------------------------------------------------------------------------- 
1 | import torch 2 | import numpy as np 3 | from sklearn import metrics 4 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score 5 | from transformers import Trainer, TrainingArguments 6 | 7 | class EmotionDataset(torch.utils.data.Dataset): 8 | def __init__(self, encodings, labels): 9 | self.encodings = encodings 10 | self.labels = labels 11 | 12 | def __getitem__(self, idx): 13 | item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} 14 | item['labels'] = torch.tensor(self.labels[idx]) 15 | return item 16 | 17 | def __len__(self): 18 | return len(self.labels) 19 | 20 | class MultilabelTrainer(Trainer): 21 | def compute_loss(self, model, inputs, return_outputs=False): 22 | labels = inputs.pop("labels") 23 | outputs = model(**inputs) 24 | logits = outputs.logits 25 | loss_fct = torch.nn.BCEWithLogitsLoss() 26 | loss = loss_fct(logits.view(-1, self.model.config.num_labels), 27 | labels.float().view(-1, self.model.config.num_labels)) 28 | return (loss, outputs) if return_outputs else loss 29 | 30 | 31 | def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 32 | y_pred = torch.from_numpy(y_pred) 33 | y_true = torch.from_numpy(y_true) 34 | if sigmoid: 35 | y_pred = y_pred.sigmoid() 36 | return ((y_pred>thresh)==y_true.bool()).float().mean().item() 37 | 38 | def weighted_f1_loss(y_pred, y_true, weight=2): 39 | y_pred = torch.from_numpy(y_pred) 40 | y_pred = y_pred.sigmoid() 41 | y_pred[y_pred>=0.5] = 1 42 | y_pred[y_pred<0.5] = 0 43 | 44 | loss = 0 45 | f1_scores = [] 46 | for i in range(len(y_true[0])): 47 | f1 = f1_score(y_true[:,i],y_pred.int().numpy()[:,i]) 48 | f1_scores.append(f'{f1:9.4f}') 49 | loss += weight*(1 -f1) 50 | #print(loss, f1_scores) 51 | return loss 52 | 53 | def compute_metrics(eval_pred): 54 | predictions, labels = eval_pred 55 | accuracy_thresh_value = accuracy_thresh(predictions, labels) 56 | weighted_f1_loss_value = weighted_f1_loss(predictions, labels) 57 | return {'accuracy_thresh': accuracy_thresh_value, 'f1_loss':weighted_f1_loss_value} 58 | 59 | 60 | def compute_fine_metrics2(eval_pred,emotions): 61 | metrics_result = { 62 | "f1": [], 63 | "precision": [], 64 | "recall": [], 65 | "f1_micro": [], 66 | "f1_macro": [], 67 | "f1_weighted": [], 68 | } 69 | predictions = eval_pred.predictions 70 | labels = eval_pred.label_ids 71 | predictions = torch.tensor(predictions) 72 | 73 | preds_full = torch.sigmoid(predictions).cpu().detach().numpy().tolist() 74 | 75 | preds_full = np.array(preds_full) >= 0.5 76 | labels = np.array(labels) >= 0.5 77 | 78 | for i, label in enumerate(emotions): 79 | column_preds = preds_full[:, i] 80 | column_labels = labels[:, i] 81 | prf1 = metrics.precision_recall_fscore_support( 82 | column_labels, column_preds, average="binary" 83 | ) 84 | metrics_result["f1"].append(prf1[2]) 85 | metrics_result["precision"].append(prf1[0]) 86 | metrics_result["recall"].append(prf1[1]) 87 | metrics_result["f1_micro"].append( 88 | metrics.f1_score(column_labels, column_preds, average="micro") 89 | ) 90 | metrics_result["f1_macro"].append( 91 | metrics.f1_score(column_labels, column_preds, average="macro") 92 | ) 93 | metrics_result["f1_weighted"].append( 94 | metrics.f1_score(column_labels, column_preds, average="weighted") 95 | ) 96 | 97 | return metrics_result 98 | 99 | def compute_metrics_single(pred): 100 | labels = pred.label_ids 101 | preds = pred.predictions.argmax(-1) 102 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary') 103 | acc = 
accuracy_score(labels, preds) 104 | f1_score_micro = f1_score(labels, preds, average='micro') 105 | f1_score_macro = f1_score(labels, preds, average='macro') 106 | f1_score_weighted = f1_score(labels, preds, average='weighted') 107 | return { 108 | 'accuracy': acc, 109 | 'precision': precision, 110 | 'recall': recall, 111 | 'f1': f1, 112 | 'f1_micro': f1_score_micro, 113 | 'f1_macro': f1_score_macro, 114 | 'f1_weighted': f1_score_weighted, 115 | } -------------------------------------------------------------------------------- /03_electra/models/final/german-nlp-group/electra-base-german-uncased/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "./results/checkpoint-753", 3 | "architectures": [ 4 | "ElectraForSequenceClassification" 5 | ], 6 | "attention_probs_dropout_prob": 0.1, 7 | "embedding_size": 768, 8 | "hidden_act": "gelu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 768, 11 | "id2label": { 12 | "0": "LABEL_0", 13 | "1": "LABEL_1", 14 | "2": "LABEL_2", 15 | "3": "LABEL_3", 16 | "4": "LABEL_4", 17 | "5": "LABEL_5", 18 | "6": "LABEL_6", 19 | "7": "LABEL_7" 20 | }, 21 | "initializer_range": 0.02, 22 | "intermediate_size": 3072, 23 | "label2id": { 24 | "LABEL_0": 0, 25 | "LABEL_1": 1, 26 | "LABEL_2": 2, 27 | "LABEL_3": 3, 28 | "LABEL_4": 4, 29 | "LABEL_5": 5, 30 | "LABEL_6": 6, 31 | "LABEL_7": 7 32 | }, 33 | "layer_norm_eps": 1e-12, 34 | "max_position_embeddings": 512, 35 | "model_type": "electra", 36 | "num_attention_heads": 12, 37 | "num_hidden_layers": 12, 38 | "pad_token_id": 0, 39 | "position_embedding_type": "absolute", 40 | "summary_activation": "gelu", 41 | "summary_last_dropout": 0.1, 42 | "summary_type": "first", 43 | "summary_use_proj": true, 44 | "transformers_version": "4.5.0", 45 | "type_vocab_size": 2, 46 | "vocab_size": 32767 47 | } 48 | -------------------------------------------------------------------------------- /03_electra/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.3.4 2 | numpy==1.19.5 3 | torch==1.10.0 4 | tqdm==4.62.3 5 | transformers==4.12.0 6 | datasets==1.14.0 7 | scikit-learn==1.0 8 | openpyxl==3.0.9 9 | seaborn==0.11.2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 3x8emotions 2 | ================ 3 | Tobias Widmann & Maximilian Wich 4 | June 2022 5 | 6 | Repo containing code and models for 3 different tools to measure appeals 7 | to 8 discrete emotions in *German political text*, as described and 8 | validated in the following article: 9 | 10 | *Widmann, Tobias, and Maximilian Wich. "Creating and Comparing 11 | Dictionary, Word Embedding, and Transformer-Based Models to Measure 12 | Discrete Emotions in German Political Text." Political Analysis, June 13 | 29, 2022, 1--16. * 14 | 15 | Please start by reading this article which contains information about 16 | the creation and performance of the different tools. These tools are 17 | free to use for academic research. **In case you use one or multiple of 18 | these, please always cite the article above.** 19 | 20 | **Important:** Download the files by clicking on the latest release on the right side. Two files need to be downloaded additionally due to size limitations: the ELECTRA model (pytorch_model.bin) and the locally trained word embeddings (vec_ed_preprocessed.txt). 
The folder contains all scripts to apply (1) the ed8 21 | dictionary, (2) the neural network models based on locally trained word 22 | embeddings, and (3) the ELECTRA model. 23 | 24 | 25 | ## (1) ed8 26 | 27 | The `ed8 dictionary` is provided in YAML format and can be applied via 28 | the `quanteda` package. The dictionary, together with the R script `apply_ed8.R` that applies it 29 | to a data frame with a ‘text’ column, can be found in the 30 | folder “./01_ed8”. 31 | 32 | ``` r 33 | # First, load the quanteda package 34 | library(quanteda) 35 | 36 | # Load in the dictionary 37 | ed8 <- dictionary(file = "./ed8.yml", 38 | format = "YAML") 39 | 40 | # Create the function 41 | get_ed8_emotions <- function(data){ 42 | #Create a corpus from your data frame 43 | corp <- corpus(data) 44 | 45 | #Tokenize corpus and pre-process (remove punctuation, numbers, and URLs) 46 | toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE) 47 | 48 | #Create a DFM just to measure the number of terms before removing stopwords 49 | terms_dfm <- dfm(toks) 50 | 51 | #Create bigram compounds for negation control 52 | toks_neg_bigram <- tokens_compound(toks, pattern = phrase("nicht *")) 53 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("nichts *")) 54 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("kein *")) 55 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keine *")) 56 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keinen *")) 57 | 58 | #Turn tokens into a DFM and remove stopwords 59 | emo_dfm <- dfm(toks_neg_bigram, remove = stopwords("de")) 60 | 61 | #Apply the dictionary 62 | dict_dfm_results <- dfm_lookup(emo_dfm, ed8) 63 | 64 | #Convert results back to a data frame 65 | results_df <- cbind(data, convert(dict_dfm_results, to = 'data.frame')) 66 | 67 | #Assign its length to each document 68 | results_df$terms_raw <- ntoken(terms_dfm) 69 | results_df$terms <- ntoken(emo_dfm) 70 | 71 | return(results_df) 72 | } 73 | 74 | # Now you can use the function on your data; simply pass a data frame with a column called "text" containing the text data 75 | results <- get_ed8_emotions(data) 76 | 77 | # Finally, you can create normalized emotion scores by dividing the ed8 scores by document length 78 | results$anger.norm <- results$ed8.ANGER / results$terms 79 | results$fear.norm <- results$ed8.FEAR / results$terms 80 | results$disgust.norm <- results$ed8.DISGUST / results$terms 81 | results$sadness.norm <- results$ed8.SADNESS / results$terms 82 | results$joy.norm <- results$ed8.JOY / results$terms 83 | results$enthusiasm.norm <- results$ed8.ENTHUSIASM / results$terms 84 | results$pride.norm <- results$ed8.PRIDE / results$terms 85 | results$hope.norm <- results$ed8.HOPE / results$terms 86 | ``` 87 | 88 | ## (2) Neural Network Classifiers 89 | 90 | The neural network classifiers and the locally trained word embedding model 91 | are provided in the folder “./02_neuralnet”. The code for turning text into 92 | numerical vectors and subsequently applying the neural network 93 | classifiers can be found in the R script `apply_neuralnet.R`. Remember, the machine learning models were trained on sentences, so you need to split your text data into sentences first.
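For illustration, here is a minimal, hypothetical input in the shape the script below expects — any data frame with a character column named `text` will work (the example sentences are invented):

``` r
# Hypothetical toy input for the script below: a data frame with a
# character column called 'text'
data <- data.frame(
  doc_id = c(1, 2),
  text = c("Das ist eine Schande. Das werden wir nicht hinnehmen.",
           "Wir blicken voller Hoffnung in die Zukunft."),
  stringsAsFactors = FALSE
)
```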
94 | 95 | ``` r 96 | # Load necessary packages 97 | library(quanteda) 98 | library(corpus) 99 | library(keras) 100 | library(tidytext) 101 | 102 | # Set working directory 103 | setwd("./02_neuralnet") 104 | 105 | 106 | # First, you need to split your text into sentences 107 | data <- data %>% 108 | unnest_tokens(sentences, text, "sentences") 109 | 110 | # Now, you can turn your text documents into a corpus 111 | corp <- corpus(data$sentences) 112 | 113 | # Create a document-feature matrix and conduct pre-processing 114 | text_dfm <- dfm(corp, remove=stopwords("german"), verbose=TRUE, tolower = TRUE) 115 | 116 | # Stemming 117 | text_dfm <- dfm_wordstem(text_dfm, language = "german") 118 | 119 | # Now, we will convert the word embeddings into a data frame 120 | # and match the features from each document with their corresponding embeddings 121 | 122 | # First, we load the locally trained word embeddings into R 123 | w2v <- readr::read_delim("./vec_ed_preprocessed.txt", 124 | skip=1, delim=" ", quote="", 125 | col_names=c("word", paste0("V", 1:100))) 126 | 127 | # Stem the terms included in the embeddings to increase matches 128 | w2v$word <- text_tokens(w2v$word, stemmer = "de") 129 | 130 | # Create a new feature matrix for the embeddings 131 | embed <- matrix(NA, nrow=ndoc(text_dfm), ncol=100) 132 | for (i in 1:ndoc(text_dfm)){ 133 | if (i %% 100 == 0) message(i, '/', ndoc(text_dfm)) 134 | # extract word counts 135 | vec <- as.numeric(text_dfm[i,]) 136 | # keep words with counts of 1 or more 137 | doc_words <- featnames(text_dfm)[vec>0] 138 | # extract embeddings for those words 139 | embed_vec <- w2v[w2v$word %in% doc_words, 2:101] 140 | # aggregate from word- to document-level embeddings by taking the average 141 | embed[i,] <- colMeans(embed_vec, na.rm=TRUE) 142 | # if no words are in the embeddings, simply set the vector to 0 143 | if (nrow(embed_vec)==0) embed[i,] <- 0 144 | } 145 | 146 | # After you have created the sentence embeddings, you can apply the trained machine learning model for each emotion 147 | # The machine learning models are provided in the folder "./02_neuralnet/models" 148 | # for example, anger (the model files are named after the emotions, e.g. "anger"): 149 | 150 | model <- load_model_hdf5("./models/anger", custom_objects = NULL, compile = TRUE) 151 | wb.anger <- model %>% predict_classes(embed) 152 | data <- cbind(data, wb.anger) 153 | ``` 154 | 155 | ## (3) ELECTRA Model 156 | 157 | The ELECTRA files are provided in the folder `./03_electra`. The model can 158 | be applied to text data using the Python code shown in the notebook `apply_electra.ipynb`. The ELECTRA model was also trained on sentences, so you need to split your text data into sentences first. 159 | 160 | 161 | ``` python 162 | # Set working directory to the 03_electra folder 163 | %cd /03_electra/ 164 | ``` 165 | 166 | ``` python 167 | # load necessary modules 168 | import transformers 169 | import pandas as pd 170 | ``` 171 | 172 | ``` python 173 | # text documents the model will be applied to; expects a column called 'text' 174 | df = pd.read_csv('./data.csv') 175 | documents = list(df.text) 176 | ``` 177 | 178 | ``` python 179 | # load inferencer 180 | from helper.inferencing import Inferencer 181 | ``` 182 | 183 | ``` python 184 | # predicting 185 | predictor = Inferencer() 186 | df_results = predictor.predict_dataframe(documents) 187 | ``` 188 | 189 | ``` python 190 | # show results 191 | df_results 192 | ``` 193 | --------------------------------------------------------------------------------
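The returned data frame contains the text plus one binary 0/1 column per emotion; the `Inferencer` applies a sigmoid to the model logits and thresholds at 0.5 (see `helper/inferencing.py`). As in the final cell of `apply_electra.ipynb`, the results can then be written to disk:

``` python
# Save the results, e.g. as a .csv file (mirrors the last cell of apply_electra.ipynb)
df_results.to_csv('./electra_results.csv')
```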