├── 01_ed8 ├── apply_ed8.R └── ed8.yml ├── 02_neuralnet ├── apply_neuralnet.R └── models │ ├── anger │ ├── disgust │ ├── enthusiasm │ ├── fear │ ├── hope │ ├── joy │ ├── pride │ └── sadness ├── 03_electra ├── apply_electra.ipynb ├── helper │ ├── __pycache__ │ │ ├── inferencing.cpython-37.pyc │ │ ├── inferencing.cpython-39.pyc │ │ ├── training.cpython-37.pyc │ │ └── training.cpython-39.pyc │ ├── data_preparation.py │ ├── inferencing.py │ └── training.py ├── models │ └── final │ │ └── german-nlp-group │ │ └── electra-base-german-uncased │ │ └── config.json └── requirements.txt └── README.md /01_ed8/apply_ed8.R: -------------------------------------------------------------------------------- 1 | ################################################################## 2 | ################################################################## 3 | ## Widmann & Wich: Creating and Comparing Dictionary, Word Embedding, and Transformer-based 4 | ## Models to Measure Discrete Emotions in German Political Text 5 | ## Political Analysis 6 | ## widmann@ps.au.dk 7 | ################################################################## 8 | ################################################################## 9 | 10 | 11 | #### Applying the ed8 dictionary 12 | 13 | #### FUNCTION ed8 ################################# 14 | 15 | # First, load the quanteda package 16 | library(quanteda) 17 | 18 | 19 | # Load in the dictionary 20 | ed8 <- dictionary(file = "./ed8.yml", 21 | format = "YAML") 22 | 23 | # Create the function 24 | get_ed8_emotions <- function(data){ 25 | #Create a corpus from your data frame 26 | corp <- corpus(data) 27 | 28 | #Tokenize corpus and pre-process (remove punctuation, numbers, and URLs) 29 | toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE) 30 | 31 | #Create a DFM just to measure the number of terms before removing stopwords 32 | terms_dfm <- dfm(toks) 33 | 34 | #Create bigram compounds for negation control 35 | toks_neg_bigram <- tokens_compound(toks, pattern = phrase("nicht *")) 36 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("nichts *")) 37 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("kein *")) 38 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keine *")) 39 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keinen *")) 40 | 41 | #Turn tokens into a DFM and remove stopwords 42 | emo_dfm <- dfm(toks_neg_bigram, remove = stopwords("de")) 43 | 44 | #Apply the dictionary 45 | dict_dfm_results <- dfm_lookup(emo_dfm, ed8) 46 | 47 | #Convert results back to a data frame 48 | results_df <- cbind(data, convert(dict_dfm_results, to = 'data.frame')) 49 | 50 | #Assign its length to each document 51 | results_df$terms_raw <- ntoken(terms_dfm) 52 | results_df$terms <- ntoken(emo_dfm) 53 | 54 | return(results_df) 55 | } 56 | 57 | 58 | # Now you can use the function on your data; simply pass a data frame with a column called "text" containing the text data 59 | results <- get_ed8_emotions(data) 60 | 61 | # Finally, you can create normalized emotion scores by dividing the ed8 scores by document length 62 | results$anger.norm <- results$ed8.ANGER / results$terms 63 | results$fear.norm <- results$ed8.FEAR / results$terms 64 | results$disgust.norm <- results$ed8.DISGUST / results$terms 65 | results$sadness.norm <- results$ed8.SADNESS / results$terms 66 | results$joy.norm <- results$ed8.JOY / results$terms 67 | results$enthusiasm.norm <- results$ed8.ENTHUSIASM / results$terms 68 | results$pride.norm <-
results$ed8.PRIDE / results$terms 69 | results$hope.norm <- results$ed8.HOPE / results$terms 70 | -------------------------------------------------------------------------------- /02_neuralnet/apply_neuralnet.R: -------------------------------------------------------------------------------- 1 | 2 | ################################################################## 3 | ################################################################## 4 | ## Widmann & Wich: Creating and Comparing Dictionary, Word Embedding, and Transformer-based 5 | ## Models to Measure Discrete Emotions in German Political Text 6 | ## Political Analysis 7 | ## widmann@ps.au.dk 8 | ################################################################## 9 | ################################################################## 10 | 11 | #### Applying Neural Network Classifiers based on Locally Trained German Word Embeddings 12 | 13 | # Load necessary packages 14 | library(quanteda) 15 | library(corpus) 16 | library(keras) 17 | library(tidytext) 18 | 19 | # Set working directory 20 | setwd("./02_neuralnet") 21 | 22 | 23 | # First, you need to split your text into sentences 24 | data <- data %>% 25 | unnest_tokens(sentences, text, "sentences") 26 | 27 | # Now, you can turn your text documents into a corpus 28 | corp <- corpus(data$sentences) 29 | 30 | # Create a document-feature matrix and conduct pre-processing 31 | text_dfm <- dfm(corp, remove=stopwords("german"), verbose=TRUE, tolower = TRUE) 32 | 33 | # Stemming 34 | text_dfm <- dfm_wordstem(text_dfm, language = "german") 35 | 36 | # Now, we will convert the word embeddings into a data frame 37 | # and match the features from each document with their corresponding embeddings 38 | 39 | # First, we load the locally trained word embeddings into R 40 | w2v <- readr::read_delim("./vec_ed_preprocessed.txt", 41 | skip=1, delim=" ", quote="", 42 | col_names=c("word", paste0("V", 1:100))) 43 | 44 | # Stem the terms included in the embeddings to increase matches 45 | w2v$word <- text_tokens(w2v$word, stemmer = "de") 46 | 47 | # Create a new feature matrix for the embeddings 48 | embed <- matrix(NA, nrow=ndoc(text_dfm), ncol=100) 49 | for (i in 1:ndoc(text_dfm)){ 50 | if (i %% 100 == 0) message(i, '/', ndoc(text_dfm)) 51 | # extract word counts 52 | vec <- as.numeric(text_dfm[i,]) 53 | # keep words with counts of 1 or more 54 | doc_words <- featnames(text_dfm)[vec>0] 55 | # extract embeddings for those words 56 | embed_vec <- w2v[w2v$word %in% doc_words, 2:101] 57 | # aggregate from word- to document-level embeddings by taking the average 58 | embed[i,] <- colMeans(embed_vec, na.rm=TRUE) 59 | # if no words are in the embeddings, simply set the vector to 0 60 | if (nrow(embed_vec)==0) embed[i,] <- 0 61 | } 62 | 63 | # After you have created the sentence embeddings, you can apply the trained machine learning model for each emotion 64 | # The machine learning models are provided in the folder "./02_neuralnet/models" 65 | # for example, anger (the model files are named after the emotions, e.g. "anger"): 66 | 67 | model <- load_model_hdf5("./models/anger", custom_objects = NULL, compile = TRUE) 68 | wb.anger <- model %>% predict_classes(embed) 69 | data <- cbind(data, wb.anger) -------------------------------------------------------------------------------- /02_neuralnet/models/anger: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/anger -------------------------------------------------------------------------------- /02_neuralnet/models/disgust:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/disgust -------------------------------------------------------------------------------- /02_neuralnet/models/enthusiasm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/enthusiasm -------------------------------------------------------------------------------- /02_neuralnet/models/fear: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/fear -------------------------------------------------------------------------------- /02_neuralnet/models/hope: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/hope -------------------------------------------------------------------------------- /02_neuralnet/models/joy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/joy -------------------------------------------------------------------------------- /02_neuralnet/models/pride: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/pride -------------------------------------------------------------------------------- /02_neuralnet/models/sadness: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/02_neuralnet/models/sadness -------------------------------------------------------------------------------- /03_electra/apply_electra.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"cell_type":"code","execution_count":null,"id":"1dcf7613","metadata":{"id":"1dcf7613","outputId":"35148b94-b9cc-4ae0-d8a2-38f13ee2cde3"},"outputs":[{"name":"stdout","output_type":"stream","text":["/Volumes/GoogleDrive/My Drive/Work/Tools/3x8emotions/03_electra\n"]}],"source":["# Set working directory to the folder '03_electra', which is located in the replication folder\n","%cd /03_electra"]},{"cell_type":"markdown","id":"ce8c32d1-2b11-47f5-af78-63a5a18d13aa","metadata":{"id":"ce8c32d1-2b11-47f5-af78-63a5a18d13aa"},"source":["To install all required packages listed in requirements.txt, you can simply use the following pip command: pip install -r requirements.txt"]},{"cell_type":"code","execution_count":null,"id":"cf55407a","metadata":{"id":"cf55407a","outputId":"c359bcf2-6889-444f-a38b-d18813f984be"},"outputs":[{"ename":"ModuleNotFoundError","evalue":"No module named 'transformers'","output_type":"error","traceback":["ModuleNotFoundError: No module named 'transformers'"]}],"source":["import transformers\n","import pandas as pd"]},{"cell_type":"markdown","id":"41c8323e-de7d-4f26-850a-ff937c6ee266","metadata":{"id":"41c8323e-de7d-4f26-850a-ff937c6ee266"},"source":["# Inferencing New Data"]},{"cell_type":"code","execution_count":null,"id":"61310d76","metadata":{"id":"61310d76","outputId":"f0ab6327-06b0-47cb-e757-d396b7f0b03a"},"outputs":[{"data":{"text/html":["
(garbled HTML table preview removed — 990 rows × 86 columns; the same output follows as text/plain)
"],"text/plain":[" Unnamed: 0 TextID \\\n","0 14 2019127_0016 \n","1 20 2019127_0022 \n","2 25 2019127_0026 \n","3 28 2019127_0029 \n","4 30 2019127_0031 \n",".. ... ... \n","985 11003 2019127_9906 \n","986 11008 2019127_9911 \n","987 11016 2019127_9918 \n","988 11027 2019127_9928 \n","989 11086 2019127_9981 \n","\n"," Text Answer.1 \\\n","0 sanktionen sind immer die schlechteste option ... Ärger \n","1 fremdenfeindlichkeit rassismus hass und ressen... Ärger \n","2 das muss auch die ehemalige weinkönigin verstehen Keine Emotion \n","3 deshalb die linke wählen zb am 28 Keine Emotion \n","4 die große koalition bringt nun einen antrag fü... Keine Emotion \n",".. ... ... \n","985 die betroffenen haben heute eine ähnliche lebe... Stolz \n","986 da ist für die linke ganz klar durch kluge dip... Hoffnung \n","987 da haben sie mich leider enttäuscht Ärger \n","988 ich will es ganz deutlich sagen das gesamte ve... Ärger \n","989 union und spd hatten den gesetzentwurf aber im... Ärger \n","\n"," Answer.2 Answer.3 Answer.4 h_anger h_fear h_disgust ... hf_hope \\\n","0 NaN NaN NaN 2 0 0 ... 0 \n","1 NaN NaN NaN 3 0 1 ... 0 \n","2 NaN NaN NaN 1 0 0 ... 0 \n","3 NaN NaN NaN 0 0 0 ... 1 \n","4 NaN NaN NaN 0 0 0 ... 0 \n",".. ... ... ... ... ... ... ... ... \n","985 NaN NaN NaN 0 0 0 ... 1 \n","986 NaN NaN NaN 0 0 0 ... 1 \n","987 NaN NaN NaN 2 0 0 ... 0 \n","988 NaN NaN NaN 5 0 0 ... 0 \n","989 NaN NaN NaN 4 0 0 ... 0 \n","\n"," df_hope wb.anger wb.fear wb.disgust wb.sadness wb.joy wb.enthusiasm \\\n","0 0 1 1 0 0 0 0 \n","1 0 1 1 1 0 0 0 \n","2 0 0 0 0 0 0 0 \n","3 0 0 0 0 0 0 0 \n","4 0 0 0 0 0 0 0 \n",".. ... ... ... ... ... ... ... \n","985 1 0 0 0 0 0 0 \n","986 0 0 0 0 0 0 1 \n","987 0 0 0 0 1 0 0 \n","988 0 1 0 0 0 0 0 \n","989 0 1 0 0 1 0 0 \n","\n"," wb.pride wb.hope \n","0 0 0 \n","1 0 0 \n","2 0 0 \n","3 0 0 \n","4 0 0 \n",".. ... ... \n","985 0 0 \n","986 1 0 \n","987 0 0 \n","988 0 0 \n","989 0 0 \n","\n","[990 rows x 86 columns]"]},"execution_count":3,"metadata":{},"output_type":"execute_result"}],"source":["# Load in your data (e.g. as a csv file)\n","df = pd.read_csv('./data.csv') \n","documents = list(df.text) "]},{"cell_type":"code","execution_count":null,"id":"d5052e91","metadata":{"id":"d5052e91"},"outputs":[],"source":["from helper.inferencing import Inferencer"]},{"cell_type":"code","execution_count":null,"id":"615ff91d","metadata":{"id":"615ff91d"},"outputs":[],"source":["# Predicting\n","predictor = Inferencer()\n","df = predictor.predict_dataframe(documents)"]},{"cell_type":"code","execution_count":null,"id":"85b19294","metadata":{"id":"85b19294","outputId":"b201074e-51d3-4372-df84-8b043e163a0c"},"outputs":[{"data":{"text/html":["
(garbled HTML table preview removed — 990 rows × 9 columns; the same output follows as text/plain)
"],"text/plain":[" text anger fear disgust \\\n","0 sanktionen sind immer die schlechteste option ... 1.0 1.0 0.0 \n","1 fremdenfeindlichkeit rassismus hass und ressen... 1.0 1.0 1.0 \n","2 das muss auch die ehemalige weinkönigin verstehen 0.0 0.0 0.0 \n","3 deshalb die linke wählen zb am 28 0.0 0.0 0.0 \n","4 die große koalition bringt nun einen antrag fü... 0.0 0.0 0.0 \n",".. ... ... ... ... \n","985 die betroffenen haben heute eine ähnliche lebe... 0.0 0.0 0.0 \n","986 da ist für die linke ganz klar durch kluge dip... 0.0 0.0 0.0 \n","987 da haben sie mich leider enttäuscht 1.0 0.0 0.0 \n","988 ich will es ganz deutlich sagen das gesamte ve... 1.0 0.0 0.0 \n","989 union und spd hatten den gesetzentwurf aber im... 1.0 0.0 0.0 \n","\n"," sadness joy enthusiasm pride hope \n","0 0.0 0.0 0.0 0.0 0.0 \n","1 1.0 0.0 0.0 0.0 0.0 \n","2 0.0 0.0 0.0 0.0 0.0 \n","3 0.0 0.0 0.0 0.0 0.0 \n","4 0.0 1.0 0.0 1.0 0.0 \n",".. ... ... ... ... ... \n","985 0.0 1.0 0.0 1.0 0.0 \n","986 0.0 0.0 1.0 1.0 0.0 \n","987 1.0 0.0 0.0 0.0 0.0 \n","988 0.0 0.0 0.0 0.0 0.0 \n","989 1.0 0.0 0.0 0.0 0.0 \n","\n","[990 rows x 9 columns]"]},"execution_count":6,"metadata":{},"output_type":"execute_result"}],"source":["# Show results\n","df"]},{"cell_type":"code","execution_count":null,"id":"eac44d08-2612-4699-a06d-ee78e77359d9","metadata":{"id":"eac44d08-2612-4699-a06d-ee78e77359d9"},"outputs":[],"source":["# Save results, e.g. as .csv file \n","df.to_csv('./electra_results.csv')"]}],"metadata":{"environment":{"kernel":"python3","name":"common-cu110.m87","type":"gcloud","uri":"gcr.io/deeplearning-platform-release/base-cu110:m87"},"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.8.8"},"colab":{"name":"apply_electra.ipynb","provenance":[],"collapsed_sections":[]}},"nbformat":4,"nbformat_minor":5} -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/inferencing.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/inferencing.cpython-37.pyc -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/inferencing.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/inferencing.cpython-39.pyc -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/training.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/training.cpython-37.pyc -------------------------------------------------------------------------------- /03_electra/helper/__pycache__/training.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tweedmann/3x8emotions/1e54ee6d69af459a24a1c304591eee7bcea59027/03_electra/helper/__pycache__/training.cpython-39.pyc 
-------------------------------------------------------------------------------- /03_electra/helper/data_preparation.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import torch 6 | from datasets import load_dataset 7 | from sklearn.model_selection import train_test_split 8 | from torch.utils.data import ( 9 | DataLoader, 10 | Dataset, 11 | RandomSampler, 12 | SequentialSampler, 13 | random_split, 14 | ) 15 | 16 | 17 | class HateDataset(Dataset): 18 | def __init__(self, encodings, labels): 19 | self.encodings = encodings 20 | self.labels = labels 21 | 22 | def __getitem__(self, idx): 23 | item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} 24 | item["labels"] = torch.tensor(self.labels[idx]) 25 | return item 26 | 27 | def __len__(self): 28 | return len(self.labels) 29 | 30 | def getTempFiles( 31 | path_train, 32 | path_test, 33 | path_tmp, 34 | path_validation=None, 35 | frac_train=1.0, 36 | frac_val=0.1, 37 | seed=123, 38 | ): 39 | Path(path_tmp).mkdir(parents=True, exist_ok=True) 40 | path_tmp_train = Path(path_tmp) / "train.csv" 41 | path_tmp_validation = Path(path_tmp) / "validation.csv" 42 | path_tmp_test = Path(path_tmp) / "test.csv" 43 | 44 | df = pd.read_csv(path_train, encoding="utf-8", sep="\t") 45 | df = df.sample(frac=frac_train, random_state=seed) 46 | if path_validation is None: 47 | df_train, df_validation = train_test_split( 48 | df, test_size=frac_val, random_state=seed 49 | ) 50 | else: 51 | df_train = df 52 | df_validation = pd.read_csv(path_validation, encoding="utf-8", sep="\t") 53 | df_validation = df_validation.sample( 54 | n=int(frac_val * len(df_train)), random_state=seed 55 | ) 56 | 57 | df_test = pd.read_csv(path_test, encoding="utf-8", sep="\t") 58 | 59 | df_train.to_csv(path_tmp_train, encoding="utf-8", sep="\t", index=False) 60 | df_validation.to_csv(path_tmp_validation, encoding="utf-8", sep="\t", index=False) 61 | df_test.to_csv(path_tmp_test, encoding="utf-8", sep="\t", index=False) 62 | 63 | print(len(df_train), len(df_validation), len(df_test)) 64 | return path_tmp_train, path_tmp_validation, path_tmp_test 65 | 66 | 67 | def getHateDatasets(data_params, selected_dataset, tokenizer): 68 | label_name = data_params[selected_dataset]["label"] 69 | text_name = data_params[selected_dataset]["text"] 70 | 71 | def preprocess(row): 72 | # preprocess text 73 | row["text"] = re.sub(r"\|lbr\||\|LBR\||\|AMP\||>|&", " ", row["text"]) 74 | row["text"] = re.sub(r"(^|\s)@[A-Za-z0-9_-]*", " ", row["text"]) 75 | # convert string label to integer 76 | # mapping is stored in model_params 77 | selected = data_params[selected_dataset] 78 | row[selected["label"]] = selected["mapping"][row[selected["label"]]] 79 | return row 80 | 81 | #print(data_params[selected_dataset]["train"]) 82 | #print(data_params[selected_dataset]["validation"]) 83 | #print(data_params[selected_dataset]["test"]) 84 | # load data 85 | dataset = load_dataset( 86 | "csv", 87 | data_files={ 88 | "train": data_params[selected_dataset]["train"], 89 | "validation": data_params[selected_dataset]["validation"], 90 | "test": data_params[selected_dataset]["test"], 91 | }, 92 | delimiter="\t", 93 | ) 94 | 95 | # preprocess data 96 | dataset["train"] = dataset["train"].map(preprocess) 97 | dataset["validation"] = dataset["validation"].map(preprocess) 98 | dataset["test"] = dataset["test"].map(preprocess) 99 | 100 | # tokenize data 101 | train_encodings = tokenizer( 102 | 
dataset["train"][text_name], truncation=True, padding=True 103 | ) 104 | val_encodings = tokenizer( 105 | dataset["validation"][text_name], truncation=True, padding=True 106 | ) 107 | test_encodings = tokenizer( 108 | dataset["test"][text_name], truncation=True, padding=True 109 | ) 110 | 111 | train_dataset = HateDataset(train_encodings, dataset["train"][label_name]) 112 | val_dataset = HateDataset(val_encodings, dataset["validation"][label_name]) 113 | test_dataset = HateDataset(test_encodings, dataset["test"][label_name]) 114 | 115 | return train_dataset, val_dataset, test_dataset 116 | 117 | 118 | def getHateDataLoaders(data_params, selected_dataset, tokenizer, batch_size=128): 119 | train_dataset, val_dataset, test_dataset = getHateDatasets( 120 | data_params, selected_dataset, tokenizer 121 | ) 122 | train_loader = DataLoader( 123 | dataset=train_dataset, 124 | batch_size=batch_size, 125 | sampler=SequentialSampler(train_dataset), 126 | num_workers=8, 127 | ) 128 | val_loader = DataLoader( 129 | dataset=val_dataset, 130 | batch_size=batch_size, 131 | sampler=SequentialSampler(val_dataset), 132 | num_workers=8, 133 | ) 134 | test_loader = DataLoader( 135 | dataset=test_dataset, 136 | batch_size=batch_size, 137 | sampler=SequentialSampler(test_dataset), 138 | num_workers=8, 139 | ) 140 | 141 | return train_loader, val_loader, test_loader 142 | 143 | -------------------------------------------------------------------------------- /03_electra/helper/inferencing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import torch 4 | from transformers import ( 5 | AutoModel, 6 | AutoModelForSequenceClassification, 7 | AutoTokenizer, 8 | Trainer, 9 | TrainingArguments, 10 | set_seed, 11 | ) 12 | 13 | import helper.training as tr 14 | 15 | 16 | class Inferencer: 17 | def __init__(self): 18 | self.emotions = [ 19 | "anger", 20 | "fear", 21 | "disgust", 22 | "sadness", 23 | "joy", 24 | "enthusiasm", 25 | "pride", 26 | "hope", 27 | ] 28 | self.MODEL_NAME = "german-nlp-group/electra-base-german-uncased" 29 | self.DIR_TRAINED_MODEL = "./models/final" 30 | self.SEED = 7 31 | set_seed(self.SEED) 32 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 33 | 34 | self.model = AutoModelForSequenceClassification.from_pretrained( 35 | f"{self.DIR_TRAINED_MODEL}/{self.MODEL_NAME}", num_labels=8 36 | ).to(device=self.device) 37 | self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME) 38 | 39 | def predict(self, x): 40 | val = [] 41 | for record in x: 42 | # tokenize document 43 | inputs = self.tokenizer( 44 | record, truncation=True, padding=True, return_tensors="pt" 45 | ) 46 | inputs = inputs.to(device=self.device) 47 | # inference 48 | outputs = self.model(**inputs) 49 | logits = outputs.logits 50 | prediction = logits.sigmoid() 51 | prediction[prediction >= 0.5] = 1 52 | prediction[prediction < 0.5] = 0 53 | prediction = prediction.detach().cpu().numpy() 54 | val.append(prediction[0]) 55 | return np.array(val) 56 | 57 | def predict_dataframe(self, x): 58 | predictions = self.predict(x) 59 | list_for_df = [] 60 | for i in range(len(x)): 61 | row = [*[x[i]], *predictions[i]] 62 | list_for_df.append(row) 63 | columns = ["text"] + self.emotions 64 | return pd.DataFrame(list_for_df, columns=columns) 65 | -------------------------------------------------------------------------------- /03_electra/helper/training.py: -------------------------------------------------------------------------------- 
1 | import torch 2 | import numpy as np 3 | from sklearn import metrics 4 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support, f1_score 5 | from transformers import Trainer, TrainingArguments 6 | 7 | class EmotionDataset(torch.utils.data.Dataset): 8 | def __init__(self, encodings, labels): 9 | self.encodings = encodings 10 | self.labels = labels 11 | 12 | def __getitem__(self, idx): 13 | item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} 14 | item['labels'] = torch.tensor(self.labels[idx]) 15 | return item 16 | 17 | def __len__(self): 18 | return len(self.labels) 19 | 20 | class MultilabelTrainer(Trainer): 21 | def compute_loss(self, model, inputs, return_outputs=False): 22 | labels = inputs.pop("labels") 23 | outputs = model(**inputs) 24 | logits = outputs.logits 25 | loss_fct = torch.nn.BCEWithLogitsLoss() 26 | loss = loss_fct(logits.view(-1, self.model.config.num_labels), 27 | labels.float().view(-1, self.model.config.num_labels)) 28 | return (loss, outputs) if return_outputs else loss 29 | 30 | 31 | def accuracy_thresh(y_pred, y_true, thresh=0.5, sigmoid=True): 32 | y_pred = torch.from_numpy(y_pred) 33 | y_true = torch.from_numpy(y_true) 34 | if sigmoid: 35 | y_pred = y_pred.sigmoid() 36 | return ((y_pred>thresh)==y_true.bool()).float().mean().item() 37 | 38 | def weighted_f1_loss(y_pred, y_true, weight=2): 39 | y_pred = torch.from_numpy(y_pred) 40 | y_pred = y_pred.sigmoid() 41 | y_pred[y_pred>=0.5] = 1 42 | y_pred[y_pred<0.5] = 0 43 | 44 | loss = 0 45 | f1_scores = [] 46 | for i in range(len(y_true[0])): 47 | f1 = f1_score(y_true[:,i],y_pred.int().numpy()[:,i]) 48 | f1_scores.append(f'{f1:9.4f}') 49 | loss += weight*(1 -f1) 50 | #print(loss, f1_scores) 51 | return loss 52 | 53 | def compute_metrics(eval_pred): 54 | predictions, labels = eval_pred 55 | accuracy_thresh_value = accuracy_thresh(predictions, labels) 56 | weighted_f1_loss_value = weighted_f1_loss(predictions, labels) 57 | return {'accuracy_thresh': accuracy_thresh_value, 'f1_loss':weighted_f1_loss_value} 58 | 59 | 60 | def compute_fine_metrics2(eval_pred,emotions): 61 | metrics_result = { 62 | "f1": [], 63 | "precision": [], 64 | "recall": [], 65 | "f1_micro": [], 66 | "f1_macro": [], 67 | "f1_weighted": [], 68 | } 69 | predictions = eval_pred.predictions 70 | labels = eval_pred.label_ids 71 | predictions = torch.tensor(predictions) 72 | 73 | preds_full = torch.sigmoid(predictions).cpu().detach().numpy().tolist() 74 | 75 | preds_full = np.array(preds_full) >= 0.5 76 | labels = np.array(labels) >= 0.5 77 | 78 | for i, label in enumerate(emotions): 79 | column_preds = preds_full[:, i] 80 | column_labels = labels[:, i] 81 | prf1 = metrics.precision_recall_fscore_support( 82 | column_labels, column_preds, average="binary" 83 | ) 84 | metrics_result["f1"].append(prf1[2]) 85 | metrics_result["precision"].append(prf1[0]) 86 | metrics_result["recall"].append(prf1[1]) 87 | metrics_result["f1_micro"].append( 88 | metrics.f1_score(column_labels, column_preds, average="micro") 89 | ) 90 | metrics_result["f1_macro"].append( 91 | metrics.f1_score(column_labels, column_preds, average="macro") 92 | ) 93 | metrics_result["f1_weighted"].append( 94 | metrics.f1_score(column_labels, column_preds, average="weighted") 95 | ) 96 | 97 | return metrics_result 98 | 99 | def compute_metrics_single(pred): 100 | labels = pred.label_ids 101 | preds = pred.predictions.argmax(-1) 102 | precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary') 103 | acc = 
accuracy_score(labels, preds) 104 | f1_score_micro = f1_score(labels, preds, average='micro') 105 | f1_score_macro = f1_score(labels, preds, average='macro') 106 | f1_score_weighted = f1_score(labels, preds, average='weighted') 107 | return { 108 | 'accuracy': acc, 109 | 'precision': precision, 110 | 'recall': recall, 111 | 'f1': f1, 112 | 'f1_micro': f1_score_micro, 113 | 'f1_macro': f1_score_macro, 114 | 'f1_weighted': f1_score_weighted, 115 | } -------------------------------------------------------------------------------- /03_electra/models/final/german-nlp-group/electra-base-german-uncased/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name_or_path": "./results/checkpoint-753", 3 | "architectures": [ 4 | "ElectraForSequenceClassification" 5 | ], 6 | "attention_probs_dropout_prob": 0.1, 7 | "embedding_size": 768, 8 | "hidden_act": "gelu", 9 | "hidden_dropout_prob": 0.1, 10 | "hidden_size": 768, 11 | "id2label": { 12 | "0": "LABEL_0", 13 | "1": "LABEL_1", 14 | "2": "LABEL_2", 15 | "3": "LABEL_3", 16 | "4": "LABEL_4", 17 | "5": "LABEL_5", 18 | "6": "LABEL_6", 19 | "7": "LABEL_7" 20 | }, 21 | "initializer_range": 0.02, 22 | "intermediate_size": 3072, 23 | "label2id": { 24 | "LABEL_0": 0, 25 | "LABEL_1": 1, 26 | "LABEL_2": 2, 27 | "LABEL_3": 3, 28 | "LABEL_4": 4, 29 | "LABEL_5": 5, 30 | "LABEL_6": 6, 31 | "LABEL_7": 7 32 | }, 33 | "layer_norm_eps": 1e-12, 34 | "max_position_embeddings": 512, 35 | "model_type": "electra", 36 | "num_attention_heads": 12, 37 | "num_hidden_layers": 12, 38 | "pad_token_id": 0, 39 | "position_embedding_type": "absolute", 40 | "summary_activation": "gelu", 41 | "summary_last_dropout": 0.1, 42 | "summary_type": "first", 43 | "summary_use_proj": true, 44 | "transformers_version": "4.5.0", 45 | "type_vocab_size": 2, 46 | "vocab_size": 32767 47 | } 48 | -------------------------------------------------------------------------------- /03_electra/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.3.4 2 | numpy==1.19.5 3 | torch==1.10.0 4 | tqdm==4.62.3 5 | transformers==4.12.0 6 | datasets==1.14.0 7 | scikit-learn==1.0 8 | openpyxl==3.0.9 9 | seaborn==0.11.2 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 3x8emotions 2 | ================ 3 | Tobias Widmann & Maximilian Wich 4 | June 2022 5 | 6 | Repo containing code and models for 3 different tools to measure appeals 7 | to 8 discrete emotions in *German political text*, as described and 8 | validated in the following article: 9 | 10 | *Widmann, Tobias, and Maximilian Wich. "Creating and Comparing 11 | Dictionary, Word Embedding, and Transformer-Based Models to Measure 12 | Discrete Emotions in German Political Text." Political Analysis, June 13 | 29, 2022, 1--16. * 14 | 15 | Please start by reading this article which contains information about 16 | the creation and performance of the different tools. These tools are 17 | free to use for academic research. **In case you use one or multiple of 18 | these, please always cite the article above.** 19 | 20 | **Important:** Download the files by clicking on the latest release on the right side. Two files need to be downloaded additionally due to size limitations: the ELECTRA model (pytorch_model.bin) and the locally trained word embeddings (vec_ed_preprocessed.txt). 
The folder contains all scripts to apply (1) the ed8 21 | dictionary, (2) the neural network models based on locally trained word 22 | embeddings, and (3) the ELECTRA model. 23 | 24 | 25 | ## (1) ed8 26 | 27 | The `ed8 dictionary` is provided in YAML format and can be applied via 28 | the `quanteda` package. The dictionary, together with the R script `apply_ed8.R` that applies it 29 | to a data frame with a ‘text’ column, can be found in the 30 | folder “./01_ed8”. 31 | 32 | ``` r 33 | # First, load the quanteda package 34 | library(quanteda) 35 | 36 | # Load in the dictionary 37 | ed8 <- dictionary(file = "./ed8.yml", 38 | format = "YAML") 39 | 40 | # Create the function 41 | get_ed8_emotions <- function(data){ 42 | #Create a corpus from your data frame 43 | corp <- corpus(data) 44 | 45 | #Tokenize corpus and pre-process (remove punctuation, numbers, and URLs) 46 | toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE, remove_url = TRUE) 47 | 48 | #Create a DFM just to measure the number of terms before removing stopwords 49 | terms_dfm <- dfm(toks) 50 | 51 | #Create bigram compounds for negation control 52 | toks_neg_bigram <- tokens_compound(toks, pattern = phrase("nicht *")) 53 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("nichts *")) 54 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("kein *")) 55 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keine *")) 56 | toks_neg_bigram <- tokens_compound(toks_neg_bigram, pattern = phrase("keinen *")) 57 | 58 | #Turn tokens into a DFM and remove stopwords 59 | emo_dfm <- dfm(toks_neg_bigram, remove = stopwords("de")) 60 | 61 | #Apply the dictionary 62 | dict_dfm_results <- dfm_lookup(emo_dfm, ed8) 63 | 64 | #Convert results back to a data frame 65 | results_df <- cbind(data, convert(dict_dfm_results, to = 'data.frame')) 66 | 67 | #Assign its length to each document 68 | results_df$terms_raw <- ntoken(terms_dfm) 69 | results_df$terms <- ntoken(emo_dfm) 70 | 71 | return(results_df) 72 | } 73 | 74 | # Now you can use the function on your data; simply pass a data frame with a column called "text" containing the text data 75 | results <- get_ed8_emotions(data) 76 | 77 | # Finally, you can create normalized emotion scores by dividing the ed8 scores by document length 78 | results$anger.norm <- results$ed8.ANGER / results$terms 79 | results$fear.norm <- results$ed8.FEAR / results$terms 80 | results$disgust.norm <- results$ed8.DISGUST / results$terms 81 | results$sadness.norm <- results$ed8.SADNESS / results$terms 82 | results$joy.norm <- results$ed8.JOY / results$terms 83 | results$enthusiasm.norm <- results$ed8.ENTHUSIASM / results$terms 84 | results$pride.norm <- results$ed8.PRIDE / results$terms 85 | results$hope.norm <- results$ed8.HOPE / results$terms 86 | ``` 87 | 88 | ## (2) Neural Network Classifiers 89 | 90 | The neural network classifiers and the locally trained word embedding model 91 | are provided in the folder “./02_neuralnet”. The code for turning text into 92 | numerical vectors and subsequently applying the neural network 93 | classifiers can be found in the R script `apply_neuralnet.R`. Remember, the machine learning models were trained on sentences, so you need to split your text data into sentences first.
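For illustration, here is a minimal, hypothetical input in the shape the script below expects — any data frame with a character column named `text` will work (the example sentences are invented):

``` r
# Hypothetical toy input for the script below: a data frame with a
# character column called 'text'
data <- data.frame(
  doc_id = c(1, 2),
  text = c("Das ist eine Schande. Das werden wir nicht hinnehmen.",
           "Wir blicken voller Hoffnung in die Zukunft."),
  stringsAsFactors = FALSE
)
```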
94 | 95 | ``` r 96 | # Load necessary packages 97 | library(quanteda) 98 | library(corpus) 99 | library(keras) 100 | library(tidytext) 101 | 102 | # Set working directory 103 | setwd("./02_neuralnet") 104 | 105 | 106 | # First, you need to split your text into sentences 107 | data <- data %>% 108 | unnest_tokens(sentences, text, "sentences") 109 | 110 | # Now, you can turn your text documents into a corpus 111 | corp <- corpus(data$sentences) 112 | 113 | # Create a document-feature matrix and conduct pre-processing 114 | text_dfm <- dfm(corp, remove=stopwords("german"), verbose=TRUE, tolower = TRUE) 115 | 116 | # Stemming 117 | text_dfm <- dfm_wordstem(text_dfm, language = "german") 118 | 119 | # Now, we will convert the word embeddings into a data frame 120 | # and match the features from each document with their corresponding embeddings 121 | 122 | # First, we load the locally trained word embeddings into R 123 | w2v <- readr::read_delim("./vec_ed_preprocessed.txt", 124 | skip=1, delim=" ", quote="", 125 | col_names=c("word", paste0("V", 1:100))) 126 | 127 | # Stem the terms included in the embeddings to increase matches 128 | w2v$word <- text_tokens(w2v$word, stemmer = "de") 129 | 130 | # Create a new feature matrix for the embeddings 131 | embed <- matrix(NA, nrow=ndoc(text_dfm), ncol=100) 132 | for (i in 1:ndoc(text_dfm)){ 133 | if (i %% 100 == 0) message(i, '/', ndoc(text_dfm)) 134 | # extract word counts 135 | vec <- as.numeric(text_dfm[i,]) 136 | # keep words with counts of 1 or more 137 | doc_words <- featnames(text_dfm)[vec>0] 138 | # extract embeddings for those words 139 | embed_vec <- w2v[w2v$word %in% doc_words, 2:101] 140 | # aggregate from word- to document-level embeddings by taking the average 141 | embed[i,] <- colMeans(embed_vec, na.rm=TRUE) 142 | # if no words are in the embeddings, simply set the vector to 0 143 | if (nrow(embed_vec)==0) embed[i,] <- 0 144 | } 145 | 146 | # After you have created the sentence embeddings, you can apply the trained machine learning model for each emotion 147 | # The machine learning models are provided in the folder "./02_neuralnet/models" 148 | # for example, anger (the model files are named after the emotions, e.g. "anger"): 149 | 150 | model <- load_model_hdf5("./models/anger", custom_objects = NULL, compile = TRUE) 151 | wb.anger <- model %>% predict_classes(embed) 152 | data <- cbind(data, wb.anger) 153 | ``` 154 | 155 | ## (3) ELECTRA Model 156 | 157 | The ELECTRA files are provided in the folder `./03_electra`. The model can 158 | be applied to text data using the Python code shown in the notebook `apply_electra.ipynb`. The ELECTRA model was also trained on sentences, so you need to split your text data into sentences first. 159 | 160 | 161 | ``` python 162 | # Set working directory to the 03_electra folder 163 | %cd /03_electra/ 164 | ``` 165 | 166 | ``` python 167 | # load necessary modules 168 | import transformers 169 | import pandas as pd 170 | ``` 171 | 172 | ``` python 173 | # text documents the model will be applied to; expects a column called 'text' 174 | df = pd.read_csv('./data.csv') 175 | documents = list(df.text) 176 | ``` 177 | 178 | ``` python 179 | # load inferencer 180 | from helper.inferencing import Inferencer 181 | ``` 182 | 183 | ``` python 184 | # predicting 185 | predictor = Inferencer() 186 | df_results = predictor.predict_dataframe(documents) 187 | ``` 188 | 189 | ``` python 190 | # show results 191 | df_results 192 | ``` 193 | --------------------------------------------------------------------------------
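The returned data frame contains the text plus one binary 0/1 column per emotion; the `Inferencer` applies a sigmoid to the model logits and thresholds at 0.5 (see `helper/inferencing.py`). As in the final cell of `apply_electra.ipynb`, the results can then be written to disk:

``` python
# Save the results, e.g. as a .csv file (mirrors the last cell of apply_electra.ipynb)
df_results.to_csv('./electra_results.csv')
```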